KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > sun > org > apache > xerces > internal > xinclude > XIncludeTextReader


1 /*
2  * The Apache Software License, Version 1.1
3  *
4  *
5  * Copyright (c) 2001-2004 The Apache Software Foundation. All rights
6  * reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  * notice, this list of conditions and the following disclaimer.
14  *
15  * 2. Redistributions in binary form must reproduce the above copyright
16  * notice, this list of conditions and the following disclaimer in
17  * the documentation and/or other materials provided with the
18  * distribution.
19  *
20  * 3. The end-user documentation included with the redistribution,
21  * if any, must include the following acknowledgment:
22  * "This product includes software developed by the
23  * Apache Software Foundation (http://www.apache.org/)."
24  * Alternately, this acknowledgment may appear in the software itself,
25  * if and wherever such third-party acknowledgments normally appear.
26  *
27  * 4. The names "Xerces" and "Apache Software Foundation" must
28  * not be used to endorse or promote products derived from this
29  * software without prior written permission. For written
30  * permission, please contact apache@apache.org.
31  *
32  * 5. Products derived from this software may not be called "Apache",
33  * nor may "Apache" appear in their name, without prior written
34  * permission of the Apache Software Foundation.
35  *
36  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
37  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
38  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
39  * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
40  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
42  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
43  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
44  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
45  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
46  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
47  * SUCH DAMAGE.
48  * ====================================================================
49  *
50  * This software consists of voluntary contributions made by many
51  * individuals on behalf of the Apache Software Foundation and was
52  * originally based on software copyright (c) 2003, International
53  * Business Machines, Inc., http://www.apache.org. For more
54  * information on the Apache Software Foundation, please see
55  * <http://www.apache.org/>.
56  */

57
58 package com.sun.org.apache.xerces.internal.xinclude;
59
60 import java.io.BufferedInputStream JavaDoc;
61 import java.io.IOException JavaDoc;
62 import java.io.InputStream JavaDoc;
63 import java.io.InputStreamReader JavaDoc;
64 import java.io.Reader JavaDoc;
65 import java.net.HttpURLConnection JavaDoc;
66 import java.net.URL JavaDoc;
67 import java.net.URLConnection JavaDoc;
68 import java.util.Locale JavaDoc;
69
70 import com.sun.org.apache.xerces.internal.impl.io.ASCIIReader;
71 import com.sun.org.apache.xerces.internal.impl.io.UTF8Reader;
72 import com.sun.org.apache.xerces.internal.impl.msg.XMLMessageFormatter;
73 import com.sun.org.apache.xerces.internal.impl.XMLEntityManager;
74 import com.sun.org.apache.xerces.internal.impl.XMLErrorReporter;
75 import com.sun.org.apache.xerces.internal.util.EncodingMap;
76 import com.sun.org.apache.xerces.internal.util.MessageFormatter;
77 import com.sun.org.apache.xerces.internal.util.XMLChar;
78 import com.sun.org.apache.xerces.internal.util.XMLStringBuffer;
79 import com.sun.org.apache.xerces.internal.xni.parser.XMLInputSource;
80
81 /**
82  * This class is used for reading resources requested in &lt;include&gt; elements,
83  * when the parse attribute of the &lt;include&gt; element is "text". Using this
84  * class will open the location, detect the encoding, and discard the byte order
85  * mark, if applicable.
86  *
87  * REVISIT:
88  * Much of the code in this class is taken from XMLEntityManager. It would be nice
89  * if this code could be shared in some way. However, since XMLEntityManager is used
90  * for reading files as XML, and this needs to read files as text, there would need
91  * to be some refactoring done.
92  *
93  * @author Michael Glavassevich, IBM
94  * @author Peter McCracken, IBM
95  * @author Arun Yadav, Sun Microsystems Inc.
96  *
97  * @version $Id: XIncludeTextReader.java,v 1.10 2004/04/15 04:51:56 mrglavas Exp $
98  *
99  * @see XIncludeHandler
100  */

101 public class XIncludeTextReader {
102
103     private Reader JavaDoc fReader;
104     private XIncludeHandler fHandler;
105     private XMLInputSource fSource;
106     private XMLErrorReporter fErrorReporter;
107     
108     // Content negotation parameters
109
private String JavaDoc fAccept;
110     private String JavaDoc fAcceptLanguage;
111  
112     /**
113      * Construct the XIncludeReader using the XMLInputSource and XIncludeHandler.
114      *
115      * @param source The XMLInputSource to use.
116      * @param handler The XIncludeHandler to use.
117      */

118     public XIncludeTextReader(XMLInputSource source, XIncludeHandler handler)
119         throws IOException JavaDoc {
120         fHandler = handler;
121         fSource = source;
122     }
123     
124     /**
125      * Sets the XMLErrorReporter used for reporting errors while
126      * reading the text include.
127      *
128      * @param errorReporter the XMLErrorReporter to be used for
129      * reporting errors.
130      */

131     public void setErrorReporter(XMLErrorReporter errorReporter) {
132         fErrorReporter = errorReporter;
133     }
134     
135     /**
136      * Sets content negotation parameters to be attached to an HTTP request.
137      *
138      * @param accept the Accept HTTP request property
139      * @param acceptLanguage the Accept-Language HTTP request property
140      */

141     public void setHttpProperties(String JavaDoc accept, String JavaDoc acceptLanguage) {
142         fAccept = accept;
143         fAcceptLanguage = acceptLanguage;
144     }
145
146     /**
147      * Return the Reader for given XMLInputSource.
148      *
149      * @param source The XMLInputSource to use.
150      */

151     protected Reader JavaDoc getReader(XMLInputSource source) throws IOException JavaDoc {
152         if (source.getCharacterStream() != null) {
153             return source.getCharacterStream();
154         }
155         else {
156             InputStream JavaDoc stream = null;
157
158             String JavaDoc encoding = source.getEncoding();
159             if (encoding == null) {
160                 encoding = "UTF-8";
161             }
162             if (source.getByteStream() != null) {
163                 stream = source.getByteStream();
164                 // Wrap the InputStream so that it is possible to rewind it.
165
if (!(stream instanceof BufferedInputStream JavaDoc)) {
166                     stream = new BufferedInputStream JavaDoc(stream);
167                 }
168             }
169             else {
170                 String JavaDoc expandedSystemId = XMLEntityManager.expandSystemId(source.getSystemId(), source.getBaseSystemId(), false);
171
172                 URL JavaDoc url = new URL JavaDoc(expandedSystemId);
173                 URLConnection JavaDoc urlCon = url.openConnection();
174                 
175                 // If this is an HTTP connection attach any
176
// content negotation parameters to the request.
177
if (urlCon instanceof HttpURLConnection JavaDoc) {
178                     if( fAccept != null && fAccept.length() > 0) {
179                         urlCon.setRequestProperty(XIncludeHandler.HTTP_ACCEPT, fAccept);
180                     }
181                     if( fAcceptLanguage != null && fAcceptLanguage.length() > 0) {
182                         urlCon.setRequestProperty(XIncludeHandler.HTTP_ACCEPT_LANGUAGE, fAcceptLanguage);
183                     }
184                 }
185                 
186                 // Wrap the InputStream so that it is possible to rewind it.
187
stream = new BufferedInputStream JavaDoc(urlCon.getInputStream());
188                 
189                 // content type will be string like "text/xml; charset=UTF-8" or "text/xml"
190
String JavaDoc rawContentType = urlCon.getContentType();
191                 
192                 // text/xml and application/xml offer only one optional parameter
193
int index = (rawContentType != null) ? rawContentType.indexOf(';') : -1;
194
195                 String JavaDoc contentType = null;
196                 String JavaDoc charset = null;
197                 if (index != -1) {
198                     // this should be something like "text/xml"
199
contentType = rawContentType.substring(0, index).trim();
200
201                     // this should be something like "charset=UTF-8", but we want to
202
// strip it down to just "UTF-8"
203
charset = rawContentType.substring(index + 1).trim();
204                     if (charset.startsWith("charset=")) {
205                         // 8 is the length of "charset="
206
charset = charset.substring(8).trim();
207                         // strip quotes, if present
208
if ((charset.charAt(0) == '"'
209                             && charset.charAt(charset.length() - 1) == '"')
210                             || (charset.charAt(0) == '\''
211                                 && charset.charAt(charset.length() - 1)
212                                     == '\'')) {
213                             charset =
214                                 charset.substring(1, charset.length() - 1);
215                         }
216                     }
217                     else {
218                         charset = null;
219                     }
220                 }
221                 else {
222                     contentType = rawContentType.trim();
223                 }
224
225                 String JavaDoc detectedEncoding = null;
226                 /** The encoding of such a resource is determined by:
227                     1 external encoding information, if available, otherwise
228                          -- the most common type of external information is the "charset" parameter of a MIME package
229                     2 if the media type of the resource is text/xml, application/xml, or matches the conventions text/*+xml or application/*+xml as described in XML Media Types [IETF RFC 3023], the encoding is recognized as specified in XML 1.0, otherwise
230                     3 the value of the encoding attribute if one exists, otherwise
231                     4 UTF-8.
232                  **/

233                 if (contentType.equals("text/xml")) {
234                     if (charset != null) {
235                         detectedEncoding = charset;
236                     }
237                     else {
238                         // see RFC2376 or 3023, section 3.1
239
detectedEncoding = "US-ASCII";
240                     }
241                 }
242                 else if (contentType.equals("application/xml")) {
243                     if (charset != null) {
244                         detectedEncoding = charset;
245                     }
246                     else {
247                         // see RFC2376 or 3023, section 3.2
248
detectedEncoding = getEncodingName(stream);
249                     }
250                 }
251                 else if (contentType.endsWith("+xml")) {
252                     detectedEncoding = getEncodingName(stream);
253                 }
254
255                 if (detectedEncoding != null) {
256                     encoding = detectedEncoding;
257                 }
258                 // else 3 or 4.
259
}
260             
261             encoding = encoding.toUpperCase(Locale.ENGLISH);
262             
263             // eat the Byte Order Mark
264
consumeBOM(stream, encoding);
265             
266             // If the document is UTF-8 or US-ASCII use
267
// the Xerces readers for these encodings. For
268
// US-ASCII consult the encoding map since
269
// this encoding has many aliases.
270
if (encoding.equals("UTF-8")) {
271                 return new UTF8Reader(stream,
272                     XMLEntityManager.DEFAULT_BUFFER_SIZE,
273                     fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN),
274                     fErrorReporter.getLocale() );
275             }
276             
277             // Try to use a Java reader.
278
String JavaDoc javaEncoding = EncodingMap.getIANA2JavaMapping(encoding);
279             
280             // If the specified encoding wasn't a recognized IANA encoding throw an IOException.
281
// The XIncludeHandler will report this as a ResourceError and then will
282
// attempt to include a fallback if there is one.
283
if (javaEncoding == null) {
284                 MessageFormatter aFormatter =
285                     fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN);
286                 Locale JavaDoc aLocale = fErrorReporter.getLocale();
287                 throw new IOException JavaDoc( aFormatter.formatMessage( aLocale,
288                     "EncodingDeclInvalid",
289                     new Object JavaDoc[] {encoding} ) );
290             }
291             else if (javaEncoding.equals("ASCII")) {
292                 return new ASCIIReader(stream,
293                     XMLEntityManager.DEFAULT_BUFFER_SIZE,
294                     fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN),
295                     fErrorReporter.getLocale() );
296             }
297             
298             return new InputStreamReader JavaDoc(stream, javaEncoding);
299         }
300     }
301
302     /**
303      * XMLEntityManager cares about endian-ness, since it creates its own optimized
304      * readers. Since we're just using generic Java readers for now, we're not caring
305      * about endian-ness. If this changes, even more code needs to be copied from
306      * XMLEntity manager. -- PJM
307      */

308     protected String JavaDoc getEncodingName(InputStream JavaDoc stream) throws IOException JavaDoc {
309         final byte[] b4 = new byte[4];
310         String JavaDoc encoding = null;
311
312         // this has the potential to throw an exception
313
// it will be fixed when we ensure the stream is rewindable (see note above)
314
stream.mark(4);
315         int count = stream.read(b4, 0, 4);
316         stream.reset();
317         if (count == 4) {
318             encoding = getEncodingName(b4);
319         }
320
321         return encoding;
322     }
323
324     /**
325      * Removes the byte order mark from the stream, if it exists.
326      * @param stream
327      * @param encoding
328      * @throws IOException
329      */

330     protected void consumeBOM(InputStream JavaDoc stream, String JavaDoc encoding)
331         throws IOException JavaDoc {
332
333         byte[] b = new byte[3];
334         int count = 0;
335         stream.mark(3);
336         if (encoding.equals("UTF-8")) {
337             count = stream.read(b, 0, 3);
338             if (count == 3) {
339                 int b0 = b[0] & 0xFF;
340                 int b1 = b[1] & 0xFF;
341                 int b2 = b[2] & 0xFF;
342                 if (b0 != 0xEF || b1 != 0xBB || b2 != 0xBF) {
343                     // First three bytes are not BOM, so reset.
344
stream.reset();
345                 }
346             }
347             else {
348                 stream.reset();
349             }
350         }
351         else if (encoding.startsWith("UTF-16")) {
352             count = stream.read(b, 0, 2);
353             if (count == 2) {
354                 int b0 = b[0] & 0xFF;
355                 int b1 = b[1] & 0xFF;
356                 if ((b0 != 0xFE || b1 != 0xFF)
357                     && (b0 != 0xFF || b1 != 0xFE)) {
358                     // First two bytes are not BOM, so reset.
359
stream.reset();
360                 }
361             }
362             else {
363                 stream.reset();
364             }
365         }
366         // We could do UTF-32, but since the getEncodingName() doesn't support that
367
// we won't support it here.
368
// To implement UTF-32, look for: 00 00 FE FF for big-endian
369
// or FF FE 00 00 for little-endian
370
}
371
372     /**
373      * REVISIT: This code is taken from com.sun.org.apache.xerces.internal.impl.XMLEntityManager.
374      * Is there any way we can share the code, without having it implemented twice?
375      * I think we should make it public and static in XMLEntityManager. --PJM
376      *
377      * Returns the IANA encoding name that is auto-detected from
378      * the bytes specified, with the endian-ness of that encoding where appropriate.
379      *
380      * @param b4 The first four bytes of the input.
381      * @return the encoding name, or null if no encoding could be detected
382      */

383     protected String JavaDoc getEncodingName(byte[] b4) {
384
385         // UTF-16, with BOM
386
int b0 = b4[0] & 0xFF;
387         int b1 = b4[1] & 0xFF;
388         if (b0 == 0xFE && b1 == 0xFF) {
389             // UTF-16, big-endian
390
return "UTF-16BE";
391         }
392         if (b0 == 0xFF && b1 == 0xFE) {
393             // UTF-16, little-endian
394
return "UTF-16LE";
395         }
396
397         // UTF-8 with a BOM
398
int b2 = b4[2] & 0xFF;
399         if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
400             return "UTF-8";
401         }
402
403         // other encodings
404
int b3 = b4[3] & 0xFF;
405         if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
406             // UCS-4, big endian (1234)
407
return "ISO-10646-UCS-4";
408         }
409         if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
410             // UCS-4, little endian (4321)
411
return "ISO-10646-UCS-4";
412         }
413         if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
414             // UCS-4, unusual octet order (2143)
415
return "ISO-10646-UCS-4";
416         }
417         if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
418             // UCS-4, unusual octect order (3412)
419
return "ISO-10646-UCS-4";
420         }
421         if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
422             // UTF-16, big-endian, no BOM
423
// (or could turn out to be UCS-2...
424
return "UTF-16BE";
425         }
426         if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
427             // UTF-16, little-endian, no BOM
428
// (or could turn out to be UCS-2...
429
return "UTF-16LE";
430         }
431         if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
432             // EBCDIC
433
// a la xerces1, return CP037 instead of EBCDIC here
434
return "CP037";
435         }
436
437         // this signals us to use the value from the encoding attribute
438
return null;
439
440     } // getEncodingName(byte[]):Object[]
441

442     /**
443      * Read the input stream as text, and pass the text on to the XIncludeHandler
444      * using calls to characters(). This will read all of the text it can from the
445      * resource.
446      *
447      * @throws IOException
448      */

449     public void parse() throws IOException JavaDoc {
450         // REVISIT: This method needs to be rewritten to improve performance: both
451
// time and memory. We should be reading chunks and reporting chunks instead
452
// of reading characters individually and reporting all the characters in
453
// one callback. Also, currently we don't provide any locator information:
454
// line number, column number, etc... so if we report an error it will appear
455
// as if the invalid XML character was in the include parent. -- mrglavas
456
XMLStringBuffer buffer = new XMLStringBuffer();
457         fReader = getReader(fSource);
458         int ch;
459         while((ch = fReader.read()) != -1) {
460             if (isValid(ch)) {
461                 buffer.append((char)ch);
462             }
463             else if (XMLChar.isHighSurrogate(ch)) {
464                 int ch2 = fReader.read();
465                 if (XMLChar.isLowSurrogate(ch2)) {
466
467                     // convert surrogates to a supplemental character
468
int sup = XMLChar.supplemental((char)ch, (char)ch2);
469
470                     // supplemental character must be a valid XML character
471
if (!isValid(sup)) {
472                         fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
473                                                    "InvalidCharInContent",
474                                                    new Object JavaDoc[] { Integer.toString(sup, 16) },
475                                                    XMLErrorReporter.SEVERITY_FATAL_ERROR);
476                         continue;
477                     }
478                     buffer.append((char) ch);
479                     buffer.append((char) ch2);
480                 }
481                 else {
482                     fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
483                                                "InvalidCharInContent",
484                                                new Object JavaDoc[] { Integer.toString(ch, 16) },
485                                                XMLErrorReporter.SEVERITY_FATAL_ERROR);
486                 }
487             }
488             else {
489                 fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
490                                            "InvalidCharInContent",
491                                            new Object JavaDoc[] { Integer.toString(ch, 16) },
492                                            XMLErrorReporter.SEVERITY_FATAL_ERROR);
493             }
494         }
495         if (fHandler != null && buffer.length > 0) {
496             fHandler.characters(
497                 buffer,
498                 fHandler.modifyAugmentations(null, true));
499         }
500     }
501     
502     /**
503      * Closes the stream. Call this after parse(), or when there is no longer any need
504      * for this object.
505      *
506      * @throws IOException
507      */

508     public void close() throws IOException JavaDoc {
509         if (fReader != null) {
510             fReader.close();
511         }
512     }
513     
514     /**
515      * Returns true if the specified character is a valid XML character
516      * as per the rules of XML 1.0.
517      *
518      * @param ch The character to check.
519      */

520     protected boolean isValid(int ch) {
521         return XMLChar.isValid(ch);
522     }
523 }
524
Popular Tags