KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > net > RegexUrlNormalizer


1 /* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package net.nutch.net;
5
6 import java.net.URL JavaDoc;
7 import java.net.MalformedURLException JavaDoc;
8 import java.io.IOException JavaDoc;
9 // import java.net.URI;
10
// import java.net.URISyntaxException;
11

12 import java.util.List JavaDoc;
13 import java.util.ArrayList JavaDoc;
14 import java.util.Iterator JavaDoc;
15 import java.util.logging.Logger JavaDoc;
16 import net.nutch.util.LogFormatter;
17
18 import javax.xml.parsers.*;
19 import org.w3c.dom.*;
20 import org.apache.oro.text.regex.*;
21
22 import net.nutch.util.*;
23
24 /** Allows users to do regex substitutions on all/any URLs that are encountered, which
25  * is useful for stripping session IDs from URLs.
26  *
27  * <p>This class must be specified as the URL normalizer to be used in <tt>nutch-site.xml</tt>
28  * or <tt>nutch-default.xml</tt>. To do this specify the <tt>urlnormalizer.class</tt> property to
29  * have the value: <tt>net.nutch.net.RegexUrlNormalizer</tt>. The <tt>urlnormalizer.regex.file</tt>
30  * property should also be set to the file name of an xml file which should contain the patterns
31  * and substitutions to be done on encountered URLs.</p>
32  *
33  * @author Luke Baker
34  */

35 public class RegexUrlNormalizer extends BasicUrlNormalizer
36   implements UrlNormalizer {
37
38     /** Class which holds a compiled pattern and its corresponding substition string. */
39     private static class Rule {
40       public Perl5Pattern pattern;
41       public String JavaDoc substitution;
42     }
43     
44     private List JavaDoc rules;
45     private PatternMatcher matcher = new Perl5Matcher();
46     
47     /** Default constructor which gets the file name from either <tt>nutch-site.xml</tt>
48       * or <tt>nutch-default.xml</tt> and reads that configuration file. It stores the regex patterns
49       * and corresponding substitutions in a List. The file should be in the CLASSPATH. */

50     public RegexUrlNormalizer() throws IOException JavaDoc, MalformedPatternException {
51       String JavaDoc filename = NutchConf.get("urlnormalizer.regex.file");
52       URL JavaDoc url= NutchConf.class.getClassLoader().getResource(filename);
53      
54       rules=readConfigurationFile(url.toString());
55     }
56     
57     /** Constructor which can be passed the file name, so it doesn't look in the configuration files for it. */
58     public RegexUrlNormalizer(String JavaDoc filename)
59       throws IOException JavaDoc, MalformedPatternException {
60       //URL url= NutchConf.class.getClassLoader().getResource(filename);
61
rules = readConfigurationFile(filename);
62     }
63     
64     
65     /** This function does the replacements by iterating through all the regex patterns.
66       * It accepts a string url as input and returns the altered string. */

67     public synchronized String JavaDoc regexNormalize(String JavaDoc urlString) {
68       Iterator JavaDoc i=rules.iterator();
69       while(i.hasNext()) {
70         Rule r=(Rule) i.next();
71         urlString = Util.substitute(matcher, r.pattern,
72           new Perl5Substitution(r.substitution), urlString, Util.SUBSTITUTE_ALL); // actual substitution
73
}
74       return urlString;
75     }
76    
77     /** Normalizes any URLs by calling super.basicNormalize()
78       * and regexSub(). This is the function that gets called
79       * elsewhere in Nutch. */

80     public synchronized String JavaDoc normalize(String JavaDoc urlString)
81       throws MalformedURLException JavaDoc {
82         urlString = super.normalize(urlString); // run basicNormalize first to ready for regexNormalize
83
urlString = regexNormalize(urlString);
84         urlString = super.normalize(urlString); // make sure regexNormalize didn't screw up the URL
85
return urlString;
86   }
87   
88   
89   
90   /** Reads the configuration file and populates a List of Rules. */
91   private static List JavaDoc readConfigurationFile(String JavaDoc filename)
92     throws IOException JavaDoc, MalformedPatternException {
93
94     Perl5Compiler compiler=new Perl5Compiler();
95     List JavaDoc rules=new ArrayList JavaDoc();
96     try {
97       
98       LOG.info("loading " + filename);
99       // borrowed heavily from code in NutchConf.java
100
Document doc =
101         DocumentBuilderFactory.newInstance().newDocumentBuilder()
102         .parse(filename);
103       Element root = doc.getDocumentElement();
104       if (!"regex-normalize".equals(root.getTagName()))
105         LOG.severe("bad conf file: top-level element not <regex-normalize>");
106       NodeList regexes = root.getChildNodes();
107       for (int i = 0; i < regexes.getLength(); i++) {
108         Node regexNode = regexes.item(i);
109         if (!(regexNode instanceof Element))
110           continue;
111         Element regex = (Element)regexNode;
112         if (!"regex".equals(regex.getTagName()))
113           LOG.warning("bad conf file: element not <regex>");
114         NodeList fields = regex.getChildNodes();
115         String JavaDoc patternValue = null;
116         String JavaDoc subValue = null;
117         for (int j = 0; j < fields.getLength(); j++) {
118           Node fieldNode = fields.item(j);
119           if (!(fieldNode instanceof Element))
120             continue;
121           Element field = (Element)fieldNode;
122           if ("pattern".equals(field.getTagName()) && field.hasChildNodes())
123             patternValue = ((Text)field.getFirstChild()).getData();
124           if ("substitution".equals(field.getTagName()) && field.hasChildNodes())
125             subValue = ((Text)field.getFirstChild()).getData();
126           if (!field.hasChildNodes())
127             subValue = "";
128         }
129         if (patternValue != null && subValue != null) {
130           Rule rule=new Rule();
131           rule.pattern=(Perl5Pattern) compiler.compile(patternValue);
132           rule.substitution=subValue;
133           rules.add(rule);
134         }
135       }
136         
137     } catch (Exception JavaDoc e) {
138       LOG.severe("error parsing " + filename +" conf file: " + e);
139     }
140     return rules;
141   }
142   
143   /** Spits out patterns and substitutions that are in the configuration file. */
144   public static void main(String JavaDoc args[])
145     throws MalformedPatternException, IOException JavaDoc {
146       RegexUrlNormalizer normalizer = new RegexUrlNormalizer();
147       Iterator JavaDoc i=normalizer.rules.iterator();
148       while(i.hasNext()) {
149         Rule r=(Rule) i.next();
150         System.out.print(r.pattern.getPattern() + " ");
151         System.out.println(r.substitution);
152       }
153     }
154   
155 }
156
Popular Tags