1 2 3 4 package net.nutch.net; 5 6 import java.net.URL ; 7 import java.net.MalformedURLException ; 8 import java.io.IOException ; 9 12 import java.util.List ; 13 import java.util.ArrayList ; 14 import java.util.Iterator ; 15 import java.util.logging.Logger ; 16 import net.nutch.util.LogFormatter; 17 18 import javax.xml.parsers.*; 19 import org.w3c.dom.*; 20 import org.apache.oro.text.regex.*; 21 22 import net.nutch.util.*; 23 24 35 public class RegexUrlNormalizer extends BasicUrlNormalizer 36 implements UrlNormalizer { 37 38 39 private static class Rule { 40 public Perl5Pattern pattern; 41 public String substitution; 42 } 43 44 private List rules; 45 private PatternMatcher matcher = new Perl5Matcher(); 46 47 50 public RegexUrlNormalizer() throws IOException , MalformedPatternException { 51 String filename = NutchConf.get("urlnormalizer.regex.file"); 52 URL url= NutchConf.class.getClassLoader().getResource(filename); 53 54 rules=readConfigurationFile(url.toString()); 55 } 56 57 58 public RegexUrlNormalizer(String filename) 59 throws IOException , MalformedPatternException { 60 rules = readConfigurationFile(filename); 62 } 63 64 65 67 public synchronized String regexNormalize(String urlString) { 68 Iterator i=rules.iterator(); 69 while(i.hasNext()) { 70 Rule r=(Rule) i.next(); 71 urlString = Util.substitute(matcher, r.pattern, 72 new Perl5Substitution(r.substitution), urlString, Util.SUBSTITUTE_ALL); } 74 return urlString; 75 } 76 77 80 public synchronized String normalize(String urlString) 81 throws MalformedURLException { 82 urlString = super.normalize(urlString); urlString = regexNormalize(urlString); 84 urlString = super.normalize(urlString); return urlString; 86 } 87 88 89 90 91 private static List readConfigurationFile(String filename) 92 throws IOException , MalformedPatternException { 93 94 Perl5Compiler compiler=new Perl5Compiler(); 95 List rules=new ArrayList (); 96 try { 97 98 LOG.info("loading " + filename); 99 Document doc = 101 DocumentBuilderFactory.newInstance().newDocumentBuilder() 102 .parse(filename); 103 Element root = doc.getDocumentElement(); 104 if (!"regex-normalize".equals(root.getTagName())) 105 LOG.severe("bad conf file: top-level element not <regex-normalize>"); 106 NodeList regexes = root.getChildNodes(); 107 for (int i = 0; i < regexes.getLength(); i++) { 108 Node regexNode = regexes.item(i); 109 if (!(regexNode instanceof Element)) 110 continue; 111 Element regex = (Element)regexNode; 112 if (!"regex".equals(regex.getTagName())) 113 LOG.warning("bad conf file: element not <regex>"); 114 NodeList fields = regex.getChildNodes(); 115 String patternValue = null; 116 String subValue = null; 117 for (int j = 0; j < fields.getLength(); j++) { 118 Node fieldNode = fields.item(j); 119 if (!(fieldNode instanceof Element)) 120 continue; 121 Element field = (Element)fieldNode; 122 if ("pattern".equals(field.getTagName()) && field.hasChildNodes()) 123 patternValue = ((Text)field.getFirstChild()).getData(); 124 if ("substitution".equals(field.getTagName()) && field.hasChildNodes()) 125 subValue = ((Text)field.getFirstChild()).getData(); 126 if (!field.hasChildNodes()) 127 subValue = ""; 128 } 129 if (patternValue != null && subValue != null) { 130 Rule rule=new Rule(); 131 rule.pattern=(Perl5Pattern) compiler.compile(patternValue); 132 rule.substitution=subValue; 133 rules.add(rule); 134 } 135 } 136 137 } catch (Exception e) { 138 LOG.severe("error parsing " + filename +" conf file: " + e); 139 } 140 return rules; 141 } 142 143 144 public static void main(String args[]) 145 throws MalformedPatternException, IOException { 146 RegexUrlNormalizer normalizer = new RegexUrlNormalizer(); 147 Iterator i=normalizer.rules.iterator(); 148 while(i.hasNext()) { 149 Rule r=(Rule) i.next(); 150 System.out.print(r.pattern.getPattern() + " "); 151 System.out.println(r.substitution); 152 } 153 } 154 155 } 156 | Popular Tags |