LinkStatusGenerator


1   /*
2    * Copyright 1999-2005 The Apache Software Foundation.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *      http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  package org.apache.cocoon.generation;
17  
18  import org.apache.avalon.excalibur.pool.Recyclable;
19  import org.apache.avalon.framework.parameters.Parameters;
20  import org.apache.avalon.framework.configuration.Configurable;
21  import org.apache.avalon.framework.configuration.Configuration;
22  import org.apache.avalon.framework.configuration.ConfigurationException;
23  import org.apache.cocoon.ProcessingException;
24  import org.apache.cocoon.ResourceNotFoundException;
25  import org.apache.cocoon.environment.SourceResolver;
26  import org.apache.cocoon.Constants;
27  import org.apache.commons.lang.StringUtils;
28  import org.apache.regexp.RE;
29  import org.apache.regexp.RESyntaxException;
30  
31  import org.xml.sax.SAXException  ;
32  import org.xml.sax.helpers.AttributesImpl  ;
33  
34  import java.io.IOException  ;
35  import java.io.InputStream  ;
36  import java.io.BufferedReader  ;
37  import java.io.InputStreamReader  ;
38  import java.net.URLConnection  ;
39  import java.net.HttpURLConnection  ;
40  import java.net.URL  ;
41  import java.util.Map  ;
42  import java.util.HashSet  ;
43  import java.util.Iterator  ;
44  import java.util.List  ;
45  import java.util.ArrayList  ;
46  
47  /**
48   * @cocoon.sitemap.component.documentation
49   * Generates a list of links that are reachable from the src and their status.
50   *
51   * @cocoon.sitemap.component.name   linkstatus
52   * @cocoon.sitemap.component.label  content
53   * @cocoon.sitemap.component.logger sitemap.generator.linkstatus
54   *
55   * @author Michael Homeijer
56   * @author Nicola Ken Barozzi (nicolaken@apache.org)
57   * @author Bernhard Huber (huber@apache.org)
58   * @version $Id: LinkStatusGenerator.java 164808 2005-04-26 16:07:03Z vgritsenko $
59   */
60  public class LinkStatusGenerator extends ServiceableGenerator
61                                   implements Recyclable, Configurable {
62  
63      /** The URI of the namespace of this generator. */
64      protected static final String   URI =
65              "http://apache.org/cocoon/linkstatus/2.0";
66  
67      /** The namespace prefix for this namespace. */
68      protected static final String   PREFIX = "linkstatus";
69  
70      /* Node and attribute names */
71      protected static final String   TOP_NODE_NAME = "linkstatus";
72      protected static final String   LINK_NODE_NAME = "link";
73  
74      protected static final String   HREF_ATTR_NAME = "href";
75      protected static final String   REFERRER_ATTR_NAME = "referrer";
76      protected static final String   CONTENT_ATTR_NAME = "content";
77      protected static final String   STATUS_ATTR_NAME = "status";
78      protected static final String   MESSAGE_ATTR_NAME = "message";
79  
80      protected AttributesImpl   attributes;
81  
82      /**
83       * Config element name specifying the expected link content-type.
84       * <p>
85       *   Its value is <code>link-content-type</code>.
86       * </p>
87       *
88       * @since
89       */
90      public final static String   LINK_CONTENT_TYPE_CONFIG = "link-content-type";
91  
92      /**
93       * Default value of <code>link-content-type</code> configuration value.
94       * <p>
95       *   Its value is <code>application/x-cocoon-links</code>.
96       * </p>
97       *
98       * @since
99       */
100     public final String   LINK_CONTENT_TYPE_DEFAULT = "application/x-cocoon-links";
101 
102     /**
103      * Config element name specifying the query-string appended for requesting
104      * the links of a URL.
105      * <p>
106      *  Its value is <code>link-view-query</code>.
107      * </p>
108      *
109      * @since
110      */
111     public final static String   LINK_VIEW_QUERY_CONFIG = "link-view-query";
112     /**
113      * Default value of <code>link-view-query</code> configuration value.
114      * <p>
115      *   Its value is <code>cocoon-view=links</code> (without a leading <code>?</code>).
116      * </p>
117      *
118      * @since
119      */
120     public final static String   LINK_VIEW_QUERY_DEFAULT = "cocoon-view=links";
121 
122     /**
123      * Config element name specifying excluding regular expression pattern.
124      * <p>
125      *  Its value is <code>exclude</code>.
126      * </p>
127      *
128      * @since
129      */
130     public final static String   EXCLUDE_CONFIG = "exclude";
131 
132     /**
133      * Config element name specifying including regular expression pattern.
134      * <p>
135      *  Its value is <code>include</code>.
136      * </p>
137      *
138      * @since
139      */
140     public final static String   INCLUDE_CONFIG = "include";
141 
142     /**
143      * Config element name specifying http header value for user-Agent.
144      * <p>
145      *  Its value is <code>user-agent</code>.
146      * </p>
147      *
148      * @since
149      */
150     public final static String   USER_AGENT_CONFIG = "user-agent";
151     /**
152      * Default value of <code>user-agent</code> configuration value.
153      *
154      * @see org.apache.cocoon.Constants#COMPLETE_NAME
155      * @since
156      */
157     public final static String   USER_AGENT_DEFAULT = Constants.COMPLETE_NAME;
158 
159     /**
160      * Config element name specifying http header value for accept.
161      * <p>
162      *  Its value is <code>accept</code>.
163      * </p>
164      *
165      * @since
166      */
167     public final static String   ACCEPT_CONFIG = "accept";
168     /**
169      * Default value of <code>accept</code> configuration value.
170      * <p>
171      *   Its value is <code>* / *</code>
172      * </p>
173      *
174      * @since
175      */
176     public final static String   ACCEPT_DEFAULT = "*/*";
177 
178     private String   linkViewQuery = LINK_VIEW_QUERY_DEFAULT;
179     private String   linkContentType = LINK_CONTENT_TYPE_DEFAULT;
180     private HashSet   excludeCrawlingURL;
181     private HashSet   includeCrawlingURL;
182     // FIXME - The following two are never read, can we delete them?
183     //private String userAgent = USER_AGENT_DEFAULT;
184     //private String accept = ACCEPT_DEFAULT;
185 
186     private HashSet   crawled;
187     private HashSet   linksToProcess;
188 
189     /**
190      * Stores links to process and the referrer links
191      */
192     private static class Link {
193         private URL   url;
194         private String   referrer;
195 
196         public Link(URL   url, String   referrer) {
197             this.url = url;
198             this.referrer = referrer;
199         }
200 
201         public URL   getURL() {
202             return url;
203         }
204 
205         public String   getReferrer() {
206             return referrer;
207         }
208 
209         public boolean equals(Link l) {
210             return url.equals(l.getURL());
211         }
212     }
213 
214     /**
215      * Configure the crawler component.
216      * <p>
217      *  Configure can specify which URI to include, and which URI to exclude
218      *  from crawling. You specify the patterns as regular expressions.
219      * </p>
220      * <p>
221      *  Morover you can configure
222      *  the required content-type of crawling request, and the
223      *  query-string appended to each crawling request.
224      * </p>
225      * <pre><tt>
226      * &lt;include&gt;.*\.html?&lt;/include&gt; or &lt;include&gt;.*\.html?, .*\.xsp&lt;/include&gt;
227      * &lt;exclude&gt;.*\.gif&lt;/exclude&gt; or &lt;exclude&gt;.*\.gif, .*\.jpe?g&lt;/exclude&gt;
228      * &lt;link-content-type&gt; application/x-cocoon-links &lt;/link-content-type&gt;
229      * &lt;link-view-query&gt; ?cocoon-view=links &lt;/link-view-query&gt;
230      * &lt;user-agent&gt; Cocoon &lt;/user-agent&gt;
231      * &lt;accept&gt; text/xml &lt;/accept&gt;
232      * </tt></pre>
233      *
234      * @param  configuration               XML configuration of this avalon component.
235      * @exception  ConfigurationException  is throwing if configuration is invalid.
236      * @since
237      */
238     public void configure(Configuration configuration)
239             throws ConfigurationException {
240 
241         Configuration[] children;
242         children = configuration.getChildren(INCLUDE_CONFIG);
243         if (children.length > 0) {
244             includeCrawlingURL = new HashSet  ();
245             for (int i = 0; i < children.length; i++) {
246                 String   pattern = children[i].getValue();
247                 try {
248                     String   params[] = StringUtils.split(pattern, ", ");
249                     for (int index = 0; index < params.length; index++) {
250                         String   tokenized_pattern = params[index];
251                         this.includeCrawlingURL.add(new RE(tokenized_pattern));
252                     }
253                 } catch (RESyntaxException rese) {
254                     getLogger().error("Cannot create including regular-expression for " +
255                             pattern, rese);
256                 }
257             }
258         }
259 
260         children = configuration.getChildren(EXCLUDE_CONFIG);
261         if (children.length > 0) {
262             excludeCrawlingURL = new HashSet  ();
263             for (int i = 0; i < children.length; i++) {
264                 String   pattern = children[i].getValue();
265                 try {
266                     String   params[] = StringUtils.split(pattern, ", ");
267                     for (int index = 0; index < params.length; index++) {
268                         String   tokenized_pattern = params[index];
269                         this.excludeCrawlingURL.add(new RE(tokenized_pattern));
270                     }
271                 } catch (RESyntaxException rese) {
272                     getLogger().error("Cannot create excluding regular-expression for " +
273                             pattern, rese);
274                 }
275             }
276         } else {
277             excludeCrawlingURL = new HashSet  ();
278             setDefaultExcludeFromCrawling();
279         }
280 
281         Configuration child;
282         String   value;
283         child = configuration.getChild(LINK_CONTENT_TYPE_CONFIG, false);
284         if (child != null) {
285             value = child.getValue();
286             if (value != null && value.length() > 0) {
287                 this.linkContentType = value.trim();
288             }
289         }
290         child = configuration.getChild(LINK_VIEW_QUERY_CONFIG, false);
291         if (child != null) {
292             value = child.getValue();
293             if (value != null && value.length() > 0) {
294                 this.linkViewQuery = value.trim();
295             }
296         }
297 /*      FIXME: Also delete this if you delete the fields above.
298         child = configuration.getChild(USER_AGENT_CONFIG, false);
299         if (child != null) {
300             value = child.getValue();
301             if (value != null && value.length() > 0) {
302                 this.userAgent = value;
303             }
304         }
305 
306         child = configuration.getChild(ACCEPT_CONFIG, false);
307         if (child != null) {
308             value = child.getValue();
309             if (value != null && value.length() > 0) {
310                 this.accept = value;
311             }
312         }
313 */
314     }
315 
316     public void setup(SourceResolver resolver, Map   objectModel, String   src, Parameters par)
317     throws ProcessingException, SAXException  , IOException   {
318 
319         super.setup(resolver, objectModel, src, par);
320 
321         /* Create a reusable attributes for creating nodes */
322         this.attributes = new AttributesImpl  ();
323 
324         // already done in configure...
325         //excludeCrawlingURL = new HashSet();
326         //this.setDefaultExcludeFromCrawling();
327     }
328 
329     /**
330      * Generate XML data.
331      *
332      * @throws  SAXException
333      *      if an error occurs while outputting the document
334      * @throws  ProcessingException
335      *      if the requsted URI wasn't found
336      */
337     public void generate()
338     throws SAXException  , ProcessingException {
339         try {
340 
341             crawled = new HashSet  ();
342             linksToProcess = new HashSet  ();
343 
344             URL   root = new URL  (source);
345             linksToProcess.add(new Link(root, ""));
346 
347 
348             if (getLogger().isDebugEnabled()) {
349                 getLogger().debug("crawl URL " + root);
350             }
351 
352             this.contentHandler.startDocument();
353             this.contentHandler.startPrefixMapping(PREFIX, URI);
354 
355             attributes.clear();
356             super.contentHandler.startElement(URI, TOP_NODE_NAME, PREFIX + ':' + TOP_NODE_NAME, attributes);
357 
358             while (linksToProcess.size() > 0) {
359                 Iterator   i = linksToProcess.iterator();
360 
361                 if (i.hasNext()) {
362                     // fetch a URL
363                     Link link = (Link) i.next();
364                     URL   url = link.getURL();
365 
366                     // remove it from the to-do list
367                     linksToProcess.remove(link);
368 
369                     String   new_url_link = processURL(url, link.getReferrer());
370 
371                     // calc all links from this url
372                     if (new_url_link != null) {
373 
374                         List   url_links = getLinksFromConnection(new_url_link, url);
375                         if (url_links != null) {
376                             // add links of this url to the to-do list
377                             linksToProcess.addAll(url_links);
378                         }
379                     }
380                 }
381             }
382 
383             super.contentHandler.endElement(URI, TOP_NODE_NAME, PREFIX + ':' + TOP_NODE_NAME);
384             this.contentHandler.endPrefixMapping(PREFIX);
385             this.contentHandler.endDocument();
386         } catch (IOException   ioe) {
387             getLogger().warn("Could not read source ", ioe);
388             throw new ResourceNotFoundException("Could not read source ", ioe);
389         }
390     }
391 
392     /**
393      * Default exclude patterns.
394      * <p>
395      *   By default URLs matching following patterns are excluded:
396      * </p>
397      * <ul>
398      *   <li>.*\\.gif(\\?.*)?$ - exclude gif images</li>
399      *   <li>.*\\.png(\\?.*)?$ - exclude png images</li>
400      *   <li>.*\\.jpe?g(\\?.*)?$ - exclude jpeg images</li>
401      *   <li>.*\\.js(\\?.*)?$ - exclude javascript </li>
402      *   <li>.*\\.css(\\?.*)?$ - exclude cascaded stylesheets</li>
403      * </ul>
404      *
405      * @since
406      */
407     private void setDefaultExcludeFromCrawling() {
408         String  [] EXCLUDE_FROM_CRAWLING_DEFAULT = {
409             ".*\\.gif(\\?.*)?$",
410             ".*\\.png(\\?.*)?$",
411             ".*\\.jpe?g(\\?.*)?$",
412             ".*\\.js(\\?.*)?$",
413             ".*\\.css(\\?.*)?$"
414         };
415 
416         for (int i = 0; i < EXCLUDE_FROM_CRAWLING_DEFAULT.length; i++) {
417             String   pattern = EXCLUDE_FROM_CRAWLING_DEFAULT[i];
418             try {
419                 excludeCrawlingURL.add(new RE(pattern));
420             } catch (RESyntaxException rese) {
421                 getLogger().error("Cannot create excluding regular-expression for " +
422                         pattern, rese);
423             }
424         }
425     }
426 
427 
428     /**
429      * Retrieve a list of links of a url
430      *
431      * @param url_link_string url for requesting links, it is assumed that
432      *   url_link_string queries the cocoon view links, ie of the form
433      *   <code>http://host/foo/bar?cocoon-view=links</code>
434      * @param url_of_referrer base url of which links are requested, ie of the form
435      *   <code>http://host/foo/bar</code>
436      * @return List of links from url_of_referrer, as result of requesting url
437      *   url_link_string
438      */
439     protected List   getLinksFromConnection(String   url_link_string, URL   url_of_referrer) {
440         List   url_links = null;
441         BufferedReader   br = null;
442         try {
443             URL   url_link = new URL  (url_link_string);
444             URLConnection   conn = url_link.openConnection();
445             String   content_type = conn.getContentType();
446 
447             if (content_type == null) {
448                 getLogger().warn("No content type available for " + String.valueOf(url_link_string));
449                 // caller checks if null
450                 return url_links;
451             }
452 
453             if (getLogger().isDebugEnabled()) {
454                 getLogger().debug("Content-type: " + content_type);
455             }
456 
457             if (content_type.equals(linkContentType) ||
458                 content_type.startsWith(linkContentType + ";")) {
459                 url_links = new ArrayList  ();
460 
461                 InputStream   is = conn.getInputStream();
462                 br = new BufferedReader  (new InputStreamReader  (is));
463 
464                 // content is supposed to be a list of links,
465                 // relative to current URL
466                 String   line;
467                 String   referrer = url_of_referrer.toString();
468 
469                 while ((line = br.readLine()) != null) {
470                     URL   new_url = new URL  (url_link, line);
471                     boolean add_url = true;
472                     // don't add new_url twice
473                     if (add_url) {
474                         add_url &= !url_links.contains(new_url);
475                     }
476 
477                     // don't add new_url if it has been crawled already
478                     if (add_url) {
479                         add_url &= !crawled.contains(new_url.toString());
480                     }
481 
482                     Link new_link = new Link(new_url, referrer);
483                     if (add_url) {
484                         add_url &= !linksToProcess.contains(new_link);
485                     }
486 
487                     // don't add if is not matched by existing include definition
488                     if (add_url) {
489                         add_url &= isIncludedURL(new_url.toString());
490                     }
491 
492                     if (add_url) {
493                         if (getLogger().isDebugEnabled()) {
494                             getLogger().debug("Add URL: " + new_url.toString());
495                         }
496                         url_links.add(new_link);
497                     }
498                 }
499                 // now we have a list of URL which should be examined
500             }
501         } catch (IOException   ioe) {
502             getLogger().warn("Problems get links of " + url_link_string, ioe);
503         } finally {
504             // explictly close the stream
505             if (br != null) {
506                 try {
507                     br.close();
508                     br = null;
509                 } catch (IOException   ignored) {
510                 }
511             }
512         }
513         return url_links;
514     }
515 
516     /**
517      * Generate xml attributes of a url, calculate url for retrieving links
518      *
519      * @param url to process
520      * @param referrer of the url
521      * @return String url for retrieving links, or null if url is an excluded-url,
522      *   and not an included-url.
523      */
524     protected String   processURL(URL   url, String   referrer) throws SAXException   {
525 
526         if (getLogger().isDebugEnabled()) {
527             getLogger().debug("getLinks URL " + url);
528         }
529 
530         String   result = null;
531 
532         // don't try to investigate a url which has been crawled already
533         if (crawled.contains(url.toString())) {
534             return null;
535         }
536 
537         // mark it as crawled
538         crawled.add(url.toString());
539 
540         attributes.clear();
541         attributes.addAttribute("", HREF_ATTR_NAME,
542                 HREF_ATTR_NAME, "CDATA", url.toString());
543         attributes.addAttribute("", REFERRER_ATTR_NAME,
544                 REFERRER_ATTR_NAME, "CDATA", referrer);
545 
546         // Output url, referrer, content-type, status, message for traversable url's
547         HttpURLConnection   h = null;
548         try {
549 
550             URLConnection   links_url_connection = url.openConnection();
551             h = (HttpURLConnection  ) links_url_connection;
552             String   content_type = links_url_connection.getContentType();
553 
554             attributes.addAttribute("", CONTENT_ATTR_NAME,
555                     CONTENT_ATTR_NAME, "CDATA",
556                     content_type);
557 
558             attributes.addAttribute("", MESSAGE_ATTR_NAME,
559                     MESSAGE_ATTR_NAME, "CDATA",
560                     h.getResponseMessage());
561 
562             attributes.addAttribute("", STATUS_ATTR_NAME,
563                     STATUS_ATTR_NAME, "CDATA",
564                     String.valueOf(h.getResponseCode()));
565         } catch (IOException   ioe) {
566             attributes.addAttribute("", MESSAGE_ATTR_NAME,
567                     MESSAGE_ATTR_NAME, "CDATA",
568                     ioe.getMessage());
569         } finally {
570             if (h != null) {
571                 h.disconnect();
572             }
573         }
574 
575         // don't try to get links of a url which is excluded from crawling
576         // try to get links of a url which is included for crawling
577         if (!isExcludedURL(url.toString()) && isIncludedURL(url.toString())) {
578             // add prefix and query to get data from the linkserializer.
579             result = url.toExternalForm()
580                     + ((url.toExternalForm().indexOf("?") == -1) ? "?" : "&")
581                     + linkViewQuery;
582         }
583 
584         super.contentHandler.startElement(URI, LINK_NODE_NAME, PREFIX + ':' + LINK_NODE_NAME, attributes);
585         super.contentHandler.endElement(URI, LINK_NODE_NAME, PREFIX + ':' + LINK_NODE_NAME);
586 
587         return result;
588     }
589 
590     /**
591      * check if URL is a candidate for indexing
592      *
593      * @param  url  Description of Parameter
594      * @return      The excludedURL value
595      * @since
596      */
597     private boolean isExcludedURL(String   url) {
598         // by default include URL for crawling
599         if (excludeCrawlingURL == null) {
600             if (getLogger().isDebugEnabled()) {
601                 getLogger().debug("exclude no URL " + url);
602             }
603             return false;
604         }
605 
606         final String   s = url;
607         Iterator   i = excludeCrawlingURL.iterator();
608         while (i.hasNext()) {
609             RE pattern = (RE) i.next();
610             if (pattern.match(s)) {
611                 if (getLogger().isDebugEnabled()) {
612                     getLogger().debug("exclude URL " + url);
613                 }
614                 return true;
615             }
616         }
617         if (getLogger().isDebugEnabled()) {
618             getLogger().debug("exclude not URL " + url);
619         }
620         return false;
621     }
622 
623 
624     /**
625      * check if URL is a candidate for indexing
626      *
627      * @param  url  Description of Parameter
628      * @return      The includedURL value
629      * @since
630      */
631     private boolean isIncludedURL(String   url) {
632         // by default include URL for crawling
633         if (includeCrawlingURL == null) {
634             if (getLogger().isDebugEnabled()) {
635                 getLogger().debug("include all URL " + url);
636             }
637             return true;
638         }
639 
640         final String   s = url;
641         Iterator   i = includeCrawlingURL.iterator();
642         while (i.hasNext()) {
643             RE pattern = (RE) i.next();
644             if (pattern.match(s)) {
645                 if (getLogger().isDebugEnabled()) {
646                     getLogger().debug("include URL " + url);
647                 }
648                 return true;
649             }
650         }
651         if (getLogger().isDebugEnabled()) {
652             getLogger().debug("include not URL " + url);
653         }
654         return false;
655     }
656 
657     public void recycle() {
658         super.recycle();
659 
660         this.attributes = null;
661     }
662 }
663
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags