KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > cocoon > generation > LinkStatusGenerator


1 /*
2  * Copyright 1999-2005 The Apache Software Foundation.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */

16 package org.apache.cocoon.generation;
17
18 import org.apache.avalon.excalibur.pool.Recyclable;
19 import org.apache.avalon.framework.parameters.Parameters;
20 import org.apache.avalon.framework.configuration.Configurable;
21 import org.apache.avalon.framework.configuration.Configuration;
22 import org.apache.avalon.framework.configuration.ConfigurationException;
23 import org.apache.cocoon.ProcessingException;
24 import org.apache.cocoon.ResourceNotFoundException;
25 import org.apache.cocoon.environment.SourceResolver;
26 import org.apache.cocoon.Constants;
27 import org.apache.commons.lang.StringUtils;
28 import org.apache.regexp.RE;
29 import org.apache.regexp.RESyntaxException;
30
31 import org.xml.sax.SAXException JavaDoc;
32 import org.xml.sax.helpers.AttributesImpl JavaDoc;
33
34 import java.io.IOException JavaDoc;
35 import java.io.InputStream JavaDoc;
36 import java.io.BufferedReader JavaDoc;
37 import java.io.InputStreamReader JavaDoc;
38 import java.net.URLConnection JavaDoc;
39 import java.net.HttpURLConnection JavaDoc;
40 import java.net.URL JavaDoc;
41 import java.util.Map JavaDoc;
42 import java.util.HashSet JavaDoc;
43 import java.util.Iterator JavaDoc;
44 import java.util.List JavaDoc;
45 import java.util.ArrayList JavaDoc;
46
47 /**
48  * @cocoon.sitemap.component.documentation
49  * Generates a list of links that are reachable from the src and their status.
50  *
51  * @cocoon.sitemap.component.name linkstatus
52  * @cocoon.sitemap.component.label content
53  * @cocoon.sitemap.component.logger sitemap.generator.linkstatus
54  *
55  * @author Michael Homeijer
56  * @author Nicola Ken Barozzi (nicolaken@apache.org)
57  * @author Bernhard Huber (huber@apache.org)
58  * @version $Id: LinkStatusGenerator.java 164808 2005-04-26 16:07:03Z vgritsenko $
59  */

60 public class LinkStatusGenerator extends ServiceableGenerator
61                                  implements Recyclable, Configurable {
62
63     /** The URI of the namespace of this generator. */
64     protected static final String JavaDoc URI =
65             "http://apache.org/cocoon/linkstatus/2.0";
66
67     /** The namespace prefix for this namespace. */
68     protected static final String JavaDoc PREFIX = "linkstatus";
69
70     /* Node and attribute names */
71     protected static final String JavaDoc TOP_NODE_NAME = "linkstatus";
72     protected static final String JavaDoc LINK_NODE_NAME = "link";
73
74     protected static final String JavaDoc HREF_ATTR_NAME = "href";
75     protected static final String JavaDoc REFERRER_ATTR_NAME = "referrer";
76     protected static final String JavaDoc CONTENT_ATTR_NAME = "content";
77     protected static final String JavaDoc STATUS_ATTR_NAME = "status";
78     protected static final String JavaDoc MESSAGE_ATTR_NAME = "message";
79
80     protected AttributesImpl JavaDoc attributes;
81
82     /**
83      * Config element name specifying expected link content-typ.
84      * <p>
85      * Its value is <code>link-content-type</code>.
86      * </p>
87      *
88      * @since
89      */

90     public final static String JavaDoc LINK_CONTENT_TYPE_CONFIG = "link-content-type";
91
92     /**
93      * Default value of <code>link-content-type</code> configuration value.
94      * <p>
95      * Its value is <code>application/x-cocoon-links</code>.
96      * </p>
97      *
98      * @since
99      */

100     public final String JavaDoc LINK_CONTENT_TYPE_DEFAULT = "application/x-cocoon-links";
101
102     /**
103      * Config element name specifying query-string appendend for requesting links
104      * of an URL.
105      * <p>
106      * Its value is <code>link-view-query</code>.
107      * </p>
108      *
109      * @since
110      */

111     public final static String JavaDoc LINK_VIEW_QUERY_CONFIG = "link-view-query";
112     /**
113      * Default value of <code>link-view-query</code> configuration value.
114      * <p>
115      * Its value is <code>?cocoon-view=links</code>.
116      * </p>
117      *
118      * @since
119      */

120     public final static String JavaDoc LINK_VIEW_QUERY_DEFAULT = "cocoon-view=links";
121
122     /**
123      * Config element name specifying excluding regular expression pattern.
124      * <p>
125      * Its value is <code>exclude</code>.
126      * </p>
127      *
128      * @since
129      */

130     public final static String JavaDoc EXCLUDE_CONFIG = "exclude";
131
132     /**
133      * Config element name specifying including regular expression pattern.
134      * <p>
135      * Its value is <code>include</code>.
136      * </p>
137      *
138      * @since
139      */

140     public final static String JavaDoc INCLUDE_CONFIG = "include";
141
142     /**
143      * Config element name specifying http header value for user-Agent.
144      * <p>
145      * Its value is <code>user-agent</code>.
146      * </p>
147      *
148      * @since
149      */

150     public final static String JavaDoc USER_AGENT_CONFIG = "user-agent";
151     /**
152      * Default value of <code>user-agent</code> configuration value.
153      *
154      * @see org.apache.cocoon.Constants#COMPLETE_NAME
155      * @since
156      */

157     public final static String JavaDoc USER_AGENT_DEFAULT = Constants.COMPLETE_NAME;
158
159     /**
160      * Config element name specifying http header value for accept.
161      * <p>
162      * Its value is <code>accept</code>.
163      * </p>
164      *
165      * @since
166      */

167     public final static String JavaDoc ACCEPT_CONFIG = "accept";
168     /**
169      * Default value of <code>accept</code> configuration value.
170      * <p>
171      * Its value is <code>* / *</code>
172      * </p>
173      *
174      * @since
175      */

176     public final static String JavaDoc ACCEPT_DEFAULT = "*/*";
177
178     private String JavaDoc linkViewQuery = LINK_VIEW_QUERY_DEFAULT;
179     private String JavaDoc linkContentType = LINK_CONTENT_TYPE_DEFAULT;
180     private HashSet JavaDoc excludeCrawlingURL;
181     private HashSet JavaDoc includeCrawlingURL;
182     // FIXME - The following two are never read, can we delete them?
183
//private String userAgent = USER_AGENT_DEFAULT;
184
//private String accept = ACCEPT_DEFAULT;
185

186     private HashSet JavaDoc crawled;
187     private HashSet JavaDoc linksToProcess;
188
189     /**
190      * Stores links to process and the referrer links
191      */

192     private static class Link {
193         private URL JavaDoc url;
194         private String JavaDoc referrer;
195
196         public Link(URL JavaDoc url, String JavaDoc referrer) {
197             this.url = url;
198             this.referrer = referrer;
199         }
200
201         public URL JavaDoc getURL() {
202             return url;
203         }
204
205         public String JavaDoc getReferrer() {
206             return referrer;
207         }
208
209         public boolean equals(Link l) {
210             return url.equals(l.getURL());
211         }
212     }
213
214     /**
215      * Configure the crawler component.
216      * <p>
217      * Configure can specify which URI to include, and which URI to exclude
218      * from crawling. You specify the patterns as regular expressions.
219      * </p>
220      * <p>
221      * Morover you can configure
222      * the required content-type of crawling request, and the
223      * query-string appended to each crawling request.
224      * </p>
225      * <pre><tt>
226      * &lt;include&gt;.*\.html?&lt;/include&gt; or &lt;include&gt;.*\.html?, .*\.xsp&lt;/include&gt;
227      * &lt;exclude&gt;.*\.gif&lt;/exclude&gt; or &lt;exclude&gt;.*\.gif, .*\.jpe?g&lt;/exclude&gt;
228      * &lt;link-content-type&gt; application/x-cocoon-links &lt;/link-content-type&gt;
229      * &lt;link-view-query&gt; ?cocoon-view=links &lt;/link-view-query&gt;
230      * &lt;user-agent&gt; Cocoon &lt;/user-agent&gt;
231      * &lt;accept&gt; text/xml &lt;/accept&gt;
232      * </tt></pre>
233      *
234      * @param configuration XML configuration of this avalon component.
235      * @exception ConfigurationException is throwing if configuration is invalid.
236      * @since
237      */

238     public void configure(Configuration configuration)
239             throws ConfigurationException {
240
241         Configuration[] children;
242         children = configuration.getChildren(INCLUDE_CONFIG);
243         if (children.length > 0) {
244             includeCrawlingURL = new HashSet JavaDoc();
245             for (int i = 0; i < children.length; i++) {
246                 String JavaDoc pattern = children[i].getValue();
247                 try {
248                     String JavaDoc params[] = StringUtils.split(pattern, ", ");
249                     for (int index = 0; index < params.length; index++) {
250                         String JavaDoc tokenized_pattern = params[index];
251                         this.includeCrawlingURL.add(new RE(tokenized_pattern));
252                     }
253                 } catch (RESyntaxException rese) {
254                     getLogger().error("Cannot create including regular-expression for " +
255                             pattern, rese);
256                 }
257             }
258         }
259
260         children = configuration.getChildren(EXCLUDE_CONFIG);
261         if (children.length > 0) {
262             excludeCrawlingURL = new HashSet JavaDoc();
263             for (int i = 0; i < children.length; i++) {
264                 String JavaDoc pattern = children[i].getValue();
265                 try {
266                     String JavaDoc params[] = StringUtils.split(pattern, ", ");
267                     for (int index = 0; index < params.length; index++) {
268                         String JavaDoc tokenized_pattern = params[index];
269                         this.excludeCrawlingURL.add(new RE(tokenized_pattern));
270                     }
271                 } catch (RESyntaxException rese) {
272                     getLogger().error("Cannot create excluding regular-expression for " +
273                             pattern, rese);
274                 }
275             }
276         } else {
277             excludeCrawlingURL = new HashSet JavaDoc();
278             setDefaultExcludeFromCrawling();
279         }
280
281         Configuration child;
282         String JavaDoc value;
283         child = configuration.getChild(LINK_CONTENT_TYPE_CONFIG, false);
284         if (child != null) {
285             value = child.getValue();
286             if (value != null && value.length() > 0) {
287                 this.linkContentType = value.trim();
288             }
289         }
290         child = configuration.getChild(LINK_VIEW_QUERY_CONFIG, false);
291         if (child != null) {
292             value = child.getValue();
293             if (value != null && value.length() > 0) {
294                 this.linkViewQuery = value.trim();
295             }
296         }
297 /* FIXME: Also delete this if you delete the fields above.
298         child = configuration.getChild(USER_AGENT_CONFIG, false);
299         if (child != null) {
300             value = child.getValue();
301             if (value != null && value.length() > 0) {
302                 this.userAgent = value;
303             }
304         }
305
306         child = configuration.getChild(ACCEPT_CONFIG, false);
307         if (child != null) {
308             value = child.getValue();
309             if (value != null && value.length() > 0) {
310                 this.accept = value;
311             }
312         }
313 */

314     }
315
316     public void setup(SourceResolver resolver, Map JavaDoc objectModel, String JavaDoc src, Parameters par)
317     throws ProcessingException, SAXException JavaDoc, IOException JavaDoc {
318
319         super.setup(resolver, objectModel, src, par);
320
321         /* Create a reusable attributes for creating nodes */
322         this.attributes = new AttributesImpl JavaDoc();
323
324         // already done in configure...
325
//excludeCrawlingURL = new HashSet();
326
//this.setDefaultExcludeFromCrawling();
327
}
328
329     /**
330      * Generate XML data.
331      *
332      * @throws SAXException
333      * if an error occurs while outputting the document
334      * @throws ProcessingException
335      * if the requsted URI wasn't found
336      */

337     public void generate()
338     throws SAXException JavaDoc, ProcessingException {
339         try {
340
341             crawled = new HashSet JavaDoc();
342             linksToProcess = new HashSet JavaDoc();
343
344             URL JavaDoc root = new URL JavaDoc(source);
345             linksToProcess.add(new Link(root, ""));
346
347
348             if (getLogger().isDebugEnabled()) {
349                 getLogger().debug("crawl URL " + root);
350             }
351
352             this.contentHandler.startDocument();
353             this.contentHandler.startPrefixMapping(PREFIX, URI);
354
355             attributes.clear();
356             super.contentHandler.startElement(URI, TOP_NODE_NAME, PREFIX + ':' + TOP_NODE_NAME, attributes);
357
358             while (linksToProcess.size() > 0) {
359                 Iterator JavaDoc i = linksToProcess.iterator();
360
361                 if (i.hasNext()) {
362                     // fetch a URL
363
Link link = (Link) i.next();
364                     URL JavaDoc url = link.getURL();
365
366                     // remove it from the to-do list
367
linksToProcess.remove(link);
368
369                     String JavaDoc new_url_link = processURL(url, link.getReferrer());
370
371                     // calc all links from this url
372
if (new_url_link != null) {
373
374                         List JavaDoc url_links = getLinksFromConnection(new_url_link, url);
375                         if (url_links != null) {
376                             // add links of this url to the to-do list
377
linksToProcess.addAll(url_links);
378                         }
379                     }
380                 }
381             }
382
383             super.contentHandler.endElement(URI, TOP_NODE_NAME, PREFIX + ':' + TOP_NODE_NAME);
384             this.contentHandler.endPrefixMapping(PREFIX);
385             this.contentHandler.endDocument();
386         } catch (IOException JavaDoc ioe) {
387             getLogger().warn("Could not read source ", ioe);
388             throw new ResourceNotFoundException("Could not read source ", ioe);
389         }
390     }
391
392     /**
393      * Default exclude patterns.
394      * <p>
395      * By default URLs matching following patterns are excluded:
396      * </p>
397      * <ul>
398      * <li>.*\\.gif(\\?.*)?$ - exclude gif images</li>
399      * <li>.*\\.png(\\?.*)?$ - exclude png images</li>
400      * <li>.*\\.jpe?g(\\?.*)?$ - exclude jpeg images</li>
401      * <li>.*\\.js(\\?.*)?$ - exclude javascript </li>
402      * <li>.*\\.css(\\?.*)?$ - exclude cascaded stylesheets</li>
403      * </ul>
404      *
405      * @since
406      */

407     private void setDefaultExcludeFromCrawling() {
408         String JavaDoc[] EXCLUDE_FROM_CRAWLING_DEFAULT = {
409             ".*\\.gif(\\?.*)?$",
410             ".*\\.png(\\?.*)?$",
411             ".*\\.jpe?g(\\?.*)?$",
412             ".*\\.js(\\?.*)?$",
413             ".*\\.css(\\?.*)?$"
414         };
415
416         for (int i = 0; i < EXCLUDE_FROM_CRAWLING_DEFAULT.length; i++) {
417             String JavaDoc pattern = EXCLUDE_FROM_CRAWLING_DEFAULT[i];
418             try {
419                 excludeCrawlingURL.add(new RE(pattern));
420             } catch (RESyntaxException rese) {
421                 getLogger().error("Cannot create excluding regular-expression for " +
422                         pattern, rese);
423             }
424         }
425     }
426
427
428     /**
429      * Retrieve a list of links of a url
430      *
431      * @param url_link_string url for requesting links, it is assumed that
432      * url_link_string queries the cocoon view links, ie of the form
433      * <code>http://host/foo/bar?cocoon-view=links</code>
434      * @param url_of_referrer base url of which links are requested, ie of the form
435      * <code>http://host/foo/bar</code>
436      * @return List of links from url_of_referrer, as result of requesting url
437      * url_link_string
438      */

439     protected List JavaDoc getLinksFromConnection(String JavaDoc url_link_string, URL JavaDoc url_of_referrer) {
440         List JavaDoc url_links = null;
441         BufferedReader JavaDoc br = null;
442         try {
443             URL JavaDoc url_link = new URL JavaDoc(url_link_string);
444             URLConnection JavaDoc conn = url_link.openConnection();
445             String JavaDoc content_type = conn.getContentType();
446
447             if (content_type == null) {
448                 getLogger().warn("No content type available for " + String.valueOf(url_link_string));
449                 // caller checks if null
450
return url_links;
451             }
452
453             if (getLogger().isDebugEnabled()) {
454                 getLogger().debug("Content-type: " + content_type);
455             }
456
457             if (content_type.equals(linkContentType) ||
458                 content_type.startsWith(linkContentType + ";")) {
459                 url_links = new ArrayList JavaDoc();
460
461                 InputStream JavaDoc is = conn.getInputStream();
462                 br = new BufferedReader JavaDoc(new InputStreamReader JavaDoc(is));
463
464                 // content is supposed to be a list of links,
465
// relative to current URL
466
String JavaDoc line;
467                 String JavaDoc referrer = url_of_referrer.toString();
468
469                 while ((line = br.readLine()) != null) {
470                     URL JavaDoc new_url = new URL JavaDoc(url_link, line);
471                     boolean add_url = true;
472                     // don't add new_url twice
473
if (add_url) {
474                         add_url &= !url_links.contains(new_url);
475                     }
476
477                     // don't add new_url if it has been crawled already
478
if (add_url) {
479                         add_url &= !crawled.contains(new_url.toString());
480                     }
481
482                     Link new_link = new Link(new_url, referrer);
483                     if (add_url) {
484                         add_url &= !linksToProcess.contains(new_link);
485                     }
486
487                     // don't add if is not matched by existing include definition
488
if (add_url) {
489                         add_url &= isIncludedURL(new_url.toString());
490                     }
491
492                     if (add_url) {
493                         if (getLogger().isDebugEnabled()) {
494                             getLogger().debug("Add URL: " + new_url.toString());
495                         }
496                         url_links.add(new_link);
497                     }
498                 }
499                 // now we have a list of URL which should be examined
500
}
501         } catch (IOException JavaDoc ioe) {
502             getLogger().warn("Problems get links of " + url_link_string, ioe);
503         } finally {
504             // explictly close the stream
505
if (br != null) {
506                 try {
507                     br.close();
508                     br = null;
509                 } catch (IOException JavaDoc ignored) {
510                 }
511             }
512         }
513         return url_links;
514     }
515
516     /**
517      * Generate xml attributes of a url, calculate url for retrieving links
518      *
519      * @param url to process
520      * @param referrer of the url
521      * @return String url for retrieving links, or null if url is an excluded-url,
522      * and not an included-url.
523      */

524     protected String JavaDoc processURL(URL JavaDoc url, String JavaDoc referrer) throws SAXException JavaDoc {
525
526         if (getLogger().isDebugEnabled()) {
527             getLogger().debug("getLinks URL " + url);
528         }
529
530         String JavaDoc result = null;
531
532         // don't try to investigate a url which has been crawled already
533
if (crawled.contains(url.toString())) {
534             return null;
535         }
536
537         // mark it as crawled
538
crawled.add(url.toString());
539
540         attributes.clear();
541         attributes.addAttribute("", HREF_ATTR_NAME,
542                 HREF_ATTR_NAME, "CDATA", url.toString());
543         attributes.addAttribute("", REFERRER_ATTR_NAME,
544                 REFERRER_ATTR_NAME, "CDATA", referrer);
545
546         // Output url, referrer, content-type, status, message for traversable url's
547
HttpURLConnection JavaDoc h = null;
548         try {
549
550             URLConnection JavaDoc links_url_connection = url.openConnection();
551             h = (HttpURLConnection JavaDoc) links_url_connection;
552             String JavaDoc content_type = links_url_connection.getContentType();
553
554             attributes.addAttribute("", CONTENT_ATTR_NAME,
555                     CONTENT_ATTR_NAME, "CDATA",
556                     content_type);
557
558             attributes.addAttribute("", MESSAGE_ATTR_NAME,
559                     MESSAGE_ATTR_NAME, "CDATA",
560                     h.getResponseMessage());
561
562             attributes.addAttribute("", STATUS_ATTR_NAME,
563                     STATUS_ATTR_NAME, "CDATA",
564                     String.valueOf(h.getResponseCode()));
565         } catch (IOException JavaDoc ioe) {
566             attributes.addAttribute("", MESSAGE_ATTR_NAME,
567                     MESSAGE_ATTR_NAME, "CDATA",
568                     ioe.getMessage());
569         } finally {
570             if (h != null) {
571                 h.disconnect();
572             }
573         }
574
575         // don't try to get links of a url which is excluded from crawling
576
// try to get links of a url which is included for crawling
577
if (!isExcludedURL(url.toString()) && isIncludedURL(url.toString())) {
578             // add prefix and query to get data from the linkserializer.
579
result = url.toExternalForm()
580                     + ((url.toExternalForm().indexOf("?") == -1) ? "?" : "&")
581                     + linkViewQuery;
582         }
583
584         super.contentHandler.startElement(URI, LINK_NODE_NAME, PREFIX + ':' + LINK_NODE_NAME, attributes);
585         super.contentHandler.endElement(URI, LINK_NODE_NAME, PREFIX + ':' + LINK_NODE_NAME);
586
587         return result;
588     }
589
590     /**
591      * check if URL is a candidate for indexing
592      *
593      * @param url Description of Parameter
594      * @return The excludedURL value
595      * @since
596      */

597     private boolean isExcludedURL(String JavaDoc url) {
598         // by default include URL for crawling
599
if (excludeCrawlingURL == null) {
600             if (getLogger().isDebugEnabled()) {
601                 getLogger().debug("exclude no URL " + url);
602             }
603             return false;
604         }
605
606         final String JavaDoc s = url;
607         Iterator JavaDoc i = excludeCrawlingURL.iterator();
608         while (i.hasNext()) {
609             RE pattern = (RE) i.next();
610             if (pattern.match(s)) {
611                 if (getLogger().isDebugEnabled()) {
612                     getLogger().debug("exclude URL " + url);
613                 }
614                 return true;
615             }
616         }
617         if (getLogger().isDebugEnabled()) {
618             getLogger().debug("exclude not URL " + url);
619         }
620         return false;
621     }
622
623
624     /**
625      * check if URL is a candidate for indexing
626      *
627      * @param url Description of Parameter
628      * @return The includedURL value
629      * @since
630      */

631     private boolean isIncludedURL(String JavaDoc url) {
632         // by default include URL for crawling
633
if (includeCrawlingURL == null) {
634             if (getLogger().isDebugEnabled()) {
635                 getLogger().debug("include all URL " + url);
636             }
637             return true;
638         }
639
640         final String JavaDoc s = url;
641         Iterator JavaDoc i = includeCrawlingURL.iterator();
642         while (i.hasNext()) {
643             RE pattern = (RE) i.next();
644             if (pattern.match(s)) {
645                 if (getLogger().isDebugEnabled()) {
646                     getLogger().debug("include URL " + url);
647                 }
648                 return true;
649             }
650         }
651         if (getLogger().isDebugEnabled()) {
652             getLogger().debug("include not URL " + url);
653         }
654         return false;
655     }
656
657     public void recycle() {
658         super.recycle();
659
660         this.attributes = null;
661     }
662 }
663
Popular Tags