001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.io.input; 018 019import java.io.BufferedInputStream; 020import java.io.BufferedReader; 021import java.io.File; 022import java.io.IOException; 023import java.io.InputStream; 024import java.io.InputStreamReader; 025import java.io.Reader; 026import java.io.StringReader; 027import java.net.HttpURLConnection; 028import java.net.URL; 029import java.net.URLConnection; 030import java.nio.charset.Charset; 031import java.nio.charset.StandardCharsets; 032import java.nio.file.Files; 033import java.nio.file.Path; 034import java.text.MessageFormat; 035import java.util.Locale; 036import java.util.Objects; 037import java.util.regex.Matcher; 038import java.util.regex.Pattern; 039 040import org.apache.commons.io.ByteOrderMark; 041import org.apache.commons.io.Charsets; 042import org.apache.commons.io.IOUtils; 043import org.apache.commons.io.build.AbstractStreamBuilder; 044import org.apache.commons.io.function.IOConsumer; 045import org.apache.commons.io.output.XmlStreamWriter; 046 047/** 048 * Character stream that handles all the necessary Voodoo to figure out the charset encoding of the XML document within the stream. 
049 * <p> 050 * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a character stream. 051 * </p> 052 * <p> 053 * All this has to be done without consuming characters from the stream, if not the XML parser will not recognize the document as a valid XML. This is not 100% 054 * true, but it's close enough (UTF-8 BOM is not handled by all parsers right now, XmlStreamReader handles it and things work in all parsers). 055 * </p> 056 * <p> 057 * The XmlStreamReader class handles the charset encoding of XML documents in Files, raw streams and HTTP streams by offering a wide set of constructors. 058 * </p> 059 * <p> 060 * By default the charset encoding detection is lenient, the constructor with the lenient flag can be used for strict detection (following HTTP MIME and XML 061 * specifications). All this is nicely explained by Mark Pilgrim in his blog, <a href="http://diveintomark.org/archives/2004/02/13/xml-media-types"> Determining 062 * the character encoding of a feed</a>. 063 * </p> 064 * <p> 065 * To build an instance, see {@link Builder}. 066 * </p> 067 * <p> 068 * Originally developed for <a href="http://rome.dev.java.net">ROME</a> under Apache License 2.0. 069 * </p> 070 * 071 * @see org.apache.commons.io.output.XmlStreamWriter 072 * @since 2.0 073 */ 074public class XmlStreamReader extends Reader { 075 076 /** 077 * Builds a new {@link XmlStreamReader} instance. 078 * 079 * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection. 080 * <p> 081 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog 082 * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type. 
083 * </p> 084 * <p> 085 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following: 086 * </p> 087 * <p> 088 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again. 089 * </p> 090 * <p> 091 * Else if the XML prolog had a charset encoding that encoding is used. 092 * </p> 093 * <p> 094 * Else if the content type had a charset encoding that encoding is used. 095 * </p> 096 * <p> 097 * Else 'UTF-8' is used. 098 * </p> 099 * <p> 100 * If lenient detection is indicated an XmlStreamReaderException is never thrown. 101 * </p> 102 * <p> 103 * For example: 104 * </p> 105 * 106 * <pre>{@code 107 * XmlStreamReader r = XmlStreamReader.builder() 108 * .setPath(path) 109 * .setCharset(StandardCharsets.UTF_8) 110 * .get();} 111 * </pre> 112 * 113 * @since 2.12.0 114 */ 115 public static class Builder extends AbstractStreamBuilder<XmlStreamReader, Builder> { 116 117 private boolean nullCharset = true; 118 private boolean lenient = true; 119 private String httpContentType; 120 121 /** 122 * Constructs a new instance. 123 * <p> 124 * This builder use the aspect InputStream, OpenOption[], httpContentType, lenient, and defaultEncoding. 125 * </p> 126 * <p> 127 * You must provide an origin that can be converted to an InputStream by this builder, otherwise, this call will throw an 128 * {@link UnsupportedOperationException}. 129 * </p> 130 * 131 * @return a new instance. 132 * @throws UnsupportedOperationException if the origin cannot provide an InputStream. 133 * @throws IOException thrown if there is a problem reading the stream. 134 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification. 135 * @see #getInputStream() 136 */ 137 @SuppressWarnings("resource") 138 @Override 139 public XmlStreamReader get() throws IOException { 140 final String defaultEncoding = nullCharset ? 
null : getCharset().name(); 141 // @formatter:off 142 return httpContentType == null 143 ? new XmlStreamReader(getInputStream(), lenient, defaultEncoding) 144 : new XmlStreamReader(getInputStream(), httpContentType, lenient, defaultEncoding); 145 // @formatter:on 146 } 147 148 @Override 149 public Builder setCharset(final Charset charset) { 150 nullCharset = charset == null; 151 return super.setCharset(charset); 152 } 153 154 @Override 155 public Builder setCharset(final String charset) { 156 nullCharset = charset == null; 157 return super.setCharset(Charsets.toCharset(charset, getCharsetDefault())); 158 } 159 160 public Builder setHttpContentType(final String httpContentType) { 161 this.httpContentType = httpContentType; 162 return this; 163 } 164 165 public Builder setLenient(final boolean lenient) { 166 this.lenient = lenient; 167 return this; 168 } 169 170 } 171 172 private static final String UTF_8 = StandardCharsets.UTF_8.name(); 173 174 private static final String US_ASCII = StandardCharsets.US_ASCII.name(); 175 176 private static final String UTF_16BE = StandardCharsets.UTF_16BE.name(); 177 178 private static final String UTF_16LE = StandardCharsets.UTF_16LE.name(); 179 180 private static final String UTF_32BE = "UTF-32BE"; 181 182 private static final String UTF_32LE = "UTF-32LE"; 183 184 private static final String UTF_16 = StandardCharsets.UTF_16.name(); 185 186 private static final String UTF_32 = "UTF-32"; 187 188 private static final String EBCDIC = "CP1047"; 189 190 private static final ByteOrderMark[] BOMS = { ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE, 191 ByteOrderMark.UTF_32LE }; 192 193 /** UTF_16LE and UTF_32LE have the same two starting BOM bytes. 
*/ 194 private static final ByteOrderMark[] XML_GUESS_BYTES = { new ByteOrderMark(UTF_8, 0x3C, 0x3F, 0x78, 0x6D), 195 new ByteOrderMark(UTF_16BE, 0x00, 0x3C, 0x00, 0x3F), new ByteOrderMark(UTF_16LE, 0x3C, 0x00, 0x3F, 0x00), 196 new ByteOrderMark(UTF_32BE, 0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D), 197 new ByteOrderMark(UTF_32LE, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00), 198 new ByteOrderMark(EBCDIC, 0x4C, 0x6F, 0xA7, 0x94) }; 199 200 private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=[\"']?([.[^; \"']]*)[\"']?"); 201 202 /** 203 * Pattern capturing the encoding of the "xml" processing instruction. 204 */ 205 public static final Pattern ENCODING_PATTERN = Pattern.compile("<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))", Pattern.MULTILINE); 206 207 private static final String RAW_EX_1 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch"; 208 209 private static final String RAW_EX_2 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM"; 210 211 private static final String HTTP_EX_1 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL"; 212 213 private static final String HTTP_EX_2 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch"; 214 215 private static final String HTTP_EX_3 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Illegal MIME"; 216 217 /** 218 * Constructs a new {@link Builder}. 219 * 220 * @return a new {@link Builder}. 221 * @since 2.12.0 222 */ 223 public static Builder builder() { 224 return new Builder(); 225 } 226 227 /** 228 * Gets the charset parameter value, NULL if not present, NULL if httpContentType is NULL. 
229 * 230 * @param httpContentType the HTTP content type 231 * @return The content type encoding (upcased) 232 */ 233 static String getContentTypeEncoding(final String httpContentType) { 234 String encoding = null; 235 if (httpContentType != null) { 236 final int i = httpContentType.indexOf(";"); 237 if (i > -1) { 238 final String postMime = httpContentType.substring(i + 1); 239 final Matcher m = CHARSET_PATTERN.matcher(postMime); 240 encoding = m.find() ? m.group(1) : null; 241 encoding = encoding != null ? encoding.toUpperCase(Locale.ROOT) : null; 242 } 243 } 244 return encoding; 245 } 246 247 /** 248 * Gets the MIME type or NULL if httpContentType is NULL. 249 * 250 * @param httpContentType the HTTP content type 251 * @return The mime content type 252 */ 253 static String getContentTypeMime(final String httpContentType) { 254 String mime = null; 255 if (httpContentType != null) { 256 final int i = httpContentType.indexOf(";"); 257 if (i >= 0) { 258 mime = httpContentType.substring(0, i); 259 } else { 260 mime = httpContentType; 261 } 262 mime = mime.trim(); 263 } 264 return mime; 265 } 266 267 /** 268 * Gets the encoding declared in the <?xml encoding=...?>, NULL if none. 269 * 270 * @param inputStream InputStream to create the reader from. 271 * @param guessedEnc guessed encoding 272 * @return the encoding declared in the <?xml encoding=...?> 273 * @throws IOException thrown if there is a problem reading the stream. 
274 */ 275 private static String getXmlProlog(final InputStream inputStream, final String guessedEnc) throws IOException { 276 String encoding = null; 277 if (guessedEnc != null) { 278 final byte[] bytes = IOUtils.byteArray(); 279 inputStream.mark(IOUtils.DEFAULT_BUFFER_SIZE); 280 int offset = 0; 281 int max = IOUtils.DEFAULT_BUFFER_SIZE; 282 int c = inputStream.read(bytes, offset, max); 283 int firstGT = -1; 284 String xmlProlog = ""; // avoid possible NPE warning (cannot happen; this just silences the warning) 285 while (c != -1 && firstGT == -1 && offset < IOUtils.DEFAULT_BUFFER_SIZE) { 286 offset += c; 287 max -= c; 288 c = inputStream.read(bytes, offset, max); 289 xmlProlog = new String(bytes, 0, offset, guessedEnc); 290 firstGT = xmlProlog.indexOf('>'); 291 } 292 if (firstGT == -1) { 293 if (c == -1) { 294 throw new IOException("Unexpected end of XML stream"); 295 } 296 throw new IOException("XML prolog or ROOT element not found on first " + offset + " bytes"); 297 } 298 final int bytesRead = offset; 299 if (bytesRead > 0) { 300 inputStream.reset(); 301 final BufferedReader bReader = new BufferedReader(new StringReader(xmlProlog.substring(0, firstGT + 1))); 302 final StringBuilder prolog = new StringBuilder(); 303 IOConsumer.forEach(bReader.lines(), prolog::append); 304 final Matcher m = ENCODING_PATTERN.matcher(prolog); 305 if (m.find()) { 306 encoding = m.group(1).toUpperCase(Locale.ROOT); 307 encoding = encoding.substring(1, encoding.length() - 1); 308 } 309 } 310 } 311 return encoding; 312 } 313 314 /** 315 * Tests if the MIME type belongs to the APPLICATION XML family. 
316 * 317 * @param mime The mime type 318 * @return true if the mime type belongs to the APPLICATION XML family, otherwise false 319 */ 320 static boolean isAppXml(final String mime) { 321 return mime != null && (mime.equals("application/xml") || mime.equals("application/xml-dtd") || mime.equals("application/xml-external-parsed-entity") 322 || mime.startsWith("application/") && mime.endsWith("+xml")); 323 } 324 325 /** 326 * Tests if the MIME type belongs to the TEXT XML family. 327 * 328 * @param mime The mime type 329 * @return true if the mime type belongs to the TEXT XML family, otherwise false 330 */ 331 static boolean isTextXml(final String mime) { 332 return mime != null && (mime.equals("text/xml") || mime.equals("text/xml-external-parsed-entity") || mime.startsWith("text/") && mime.endsWith("+xml")); 333 } 334 335 private final Reader reader; 336 337 private final String encoding; 338 339 private final String defaultEncoding; 340 341 /** 342 * Constructs a Reader for a File. 343 * <p> 344 * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to UTF-8. 345 * </p> 346 * <p> 347 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details. 348 * </p> 349 * 350 * @param file File to create a Reader from. 351 * @throws NullPointerException if the input is {@code null}. 352 * @throws IOException thrown if there is a problem reading the file. 353 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()} 354 */ 355 @Deprecated 356 public XmlStreamReader(final File file) throws IOException { 357 this(Objects.requireNonNull(file, "file").toPath()); 358 } 359 360 /** 361 * Constructs a Reader for a raw InputStream. 362 * <p> 363 * It follows the same logic used for files. 364 * </p> 365 * <p> 366 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details. 
367 * </p> 368 * 369 * @param inputStream InputStream to create a Reader from. 370 * @throws NullPointerException if the input stream is {@code null}. 371 * @throws IOException thrown if there is a problem reading the stream. 372 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()} 373 */ 374 @Deprecated 375 public XmlStreamReader(final InputStream inputStream) throws IOException { 376 this(inputStream, true); 377 } 378 379 /** 380 * Constructs a Reader for a raw InputStream. 381 * <p> 382 * It follows the same logic used for files. 383 * </p> 384 * <p> 385 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following: 386 * </p> 387 * <p> 388 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again. 389 * </p> 390 * <p> 391 * Else if the XML prolog had a charset encoding that encoding is used. 392 * </p> 393 * <p> 394 * Else if the content type had a charset encoding that encoding is used. 395 * </p> 396 * <p> 397 * Else 'UTF-8' is used. 398 * </p> 399 * <p> 400 * If lenient detection is indicated an XmlStreamReaderException is never thrown. 401 * </p> 402 * 403 * @param inputStream InputStream to create a Reader from. 404 * @param lenient indicates if the charset encoding detection should be relaxed. 405 * @throws NullPointerException if the input stream is {@code null}. 406 * @throws IOException thrown if there is a problem reading the stream. 407 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification. 408 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()} 409 */ 410 @Deprecated 411 public XmlStreamReader(final InputStream inputStream, final boolean lenient) throws IOException { 412 this(inputStream, lenient, null); 413 } 414 415 /** 416 * Constructs a Reader for a raw InputStream. 417 * <p> 418 * It follows the same logic used for files. 
419 * </p> 420 * <p> 421 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following: 422 * </p> 423 * <p> 424 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again. 425 * </p> 426 * <p> 427 * Else if the XML prolog had a charset encoding that encoding is used. 428 * </p> 429 * <p> 430 * Else if the content type had a charset encoding that encoding is used. 431 * </p> 432 * <p> 433 * Else 'UTF-8' is used. 434 * </p> 435 * <p> 436 * If lenient detection is indicated an XmlStreamReaderException is never thrown. 437 * </p> 438 * 439 * @param inputStream InputStream to create a Reader from. 440 * @param lenient indicates if the charset encoding detection should be relaxed. 441 * @param defaultEncoding The default encoding 442 * @throws NullPointerException if the input stream is {@code null}. 443 * @throws IOException thrown if there is a problem reading the stream. 444 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification. 445 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()} 446 */ 447 @Deprecated 448 @SuppressWarnings("resource") // InputStream is managed through a InputStreamReader in this instance. 449 public XmlStreamReader(final InputStream inputStream, final boolean lenient, final String defaultEncoding) throws IOException { 450 Objects.requireNonNull(inputStream, "inputStream"); 451 this.defaultEncoding = defaultEncoding; 452 final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(inputStream, IOUtils.DEFAULT_BUFFER_SIZE), false, BOMS); 453 final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES); 454 this.encoding = doRawStream(bom, pis, lenient); 455 this.reader = new InputStreamReader(pis, encoding); 456 } 457 458 /** 459 * Constructs a Reader using an InputStream and the associated content-type header. 
460 * <p> 461 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog 462 * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type. 463 * </p> 464 * <p> 465 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details. 466 * </p> 467 * 468 * @param inputStream InputStream to create the reader from. 469 * @param httpContentType content-type header to use for the resolution of the charset encoding. 470 * @throws NullPointerException if the input stream is {@code null}. 471 * @throws IOException thrown if there is a problem reading the file. 472 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()} 473 */ 474 @Deprecated 475 public XmlStreamReader(final InputStream inputStream, final String httpContentType) throws IOException { 476 this(inputStream, httpContentType, true); 477 } 478 479 /** 480 * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection. 481 * <p> 482 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog 483 * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type. 484 * </p> 485 * <p> 486 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following: 487 * </p> 488 * <p> 489 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again. 490 * </p> 491 * <p> 492 * Else if the XML prolog had a charset encoding that encoding is used. 493 * </p> 494 * <p> 495 * Else if the content type had a charset encoding that encoding is used. 496 * </p> 497 * <p> 498 * Else 'UTF-8' is used. 
499 * </p> 500 * <p> 501 * If lenient detection is indicated an XmlStreamReaderException is never thrown. 502 * </p> 503 * 504 * @param inputStream InputStream to create the reader from. 505 * @param httpContentType content-type header to use for the resolution of the charset encoding. 506 * @param lenient indicates if the charset encoding detection should be relaxed. 507 * @throws NullPointerException if the input stream is {@code null}. 508 * @throws IOException thrown if there is a problem reading the file. 509 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification. 510 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()} 511 */ 512 @Deprecated 513 public XmlStreamReader(final InputStream inputStream, final String httpContentType, final boolean lenient) throws IOException { 514 this(inputStream, httpContentType, lenient, null); 515 } 516 517 /** 518 * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection. 519 * <p> 520 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog 521 * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type. 522 * </p> 523 * <p> 524 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following: 525 * </p> 526 * <p> 527 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again. 528 * </p> 529 * <p> 530 * Else if the XML prolog had a charset encoding that encoding is used. 531 * </p> 532 * <p> 533 * Else if the content type had a charset encoding that encoding is used. 534 * </p> 535 * <p> 536 * Else 'UTF-8' is used. 
537 * </p> 538 * <p> 539 * If lenient detection is indicated an XmlStreamReaderException is never thrown. 540 * </p> 541 * 542 * @param inputStream InputStream to create the reader from. 543 * @param httpContentType content-type header to use for the resolution of the charset encoding. 544 * @param lenient indicates if the charset encoding detection should be relaxed. 545 * @param defaultEncoding The default encoding 546 * @throws NullPointerException if the input stream is {@code null}. 547 * @throws IOException thrown if there is a problem reading the file. 548 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification. 549 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()} 550 */ 551 @Deprecated 552 @SuppressWarnings("resource") // InputStream is managed through a InputStreamReader in this instance. 553 public XmlStreamReader(final InputStream inputStream, final String httpContentType, final boolean lenient, final String defaultEncoding) 554 throws IOException { 555 Objects.requireNonNull(inputStream, "inputStream"); 556 this.defaultEncoding = defaultEncoding; 557 final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(inputStream, IOUtils.DEFAULT_BUFFER_SIZE), false, BOMS); 558 final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES); 559 this.encoding = processHttpStream(bom, pis, httpContentType, lenient); 560 this.reader = new InputStreamReader(pis, encoding); 561 } 562 563 /** 564 * Constructs a Reader for a File. 565 * <p> 566 * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to UTF-8. 567 * </p> 568 * <p> 569 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details. 570 * </p> 571 * 572 * @param file File to create a Reader from. 573 * @throws NullPointerException if the input is {@code null}. 
574 * @throws IOException thrown if there is a problem reading the file. 575 * @since 2.11.0 576 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()} 577 */ 578 @Deprecated 579 @SuppressWarnings("resource") // InputStream is managed through another reader in this instance. 580 public XmlStreamReader(final Path file) throws IOException { 581 this(Files.newInputStream(Objects.requireNonNull(file, "file"))); 582 } 583 584 /** 585 * Constructs a Reader using the InputStream of a URL. 586 * <p> 587 * If the URL is not of type HTTP and there is not 'content-type' header in the fetched data it uses the same logic used for Files. 588 * </p> 589 * <p> 590 * If the URL is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for an InputStream with content-type. 591 * </p> 592 * <p> 593 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details. 594 * </p> 595 * 596 * @param url URL to create a Reader from. 597 * @throws NullPointerException if the input is {@code null}. 598 * @throws IOException thrown if there is a problem reading the stream of the URL. 599 */ 600 public XmlStreamReader(final URL url) throws IOException { 601 this(Objects.requireNonNull(url, "url").openConnection(), null); 602 } 603 604 /** 605 * Constructs a Reader using the InputStream of a URLConnection. 606 * <p> 607 * If the URLConnection is not of type HttpURLConnection and there is not 'content-type' header in the fetched data it uses the same logic used for files. 608 * </p> 609 * <p> 610 * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for an InputStream with 611 * content-type. 612 * </p> 613 * <p> 614 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details. 615 * </p> 616 * 617 * @param urlConnection URLConnection to create a Reader from. 
618 * @param defaultEncoding The default encoding 619 * @throws NullPointerException if the input is {@code null}. 620 * @throws IOException thrown if there is a problem reading the stream of the URLConnection. 621 */ 622 public XmlStreamReader(final URLConnection urlConnection, final String defaultEncoding) throws IOException { 623 Objects.requireNonNull(urlConnection, "urlConnection"); 624 this.defaultEncoding = defaultEncoding; 625 final boolean lenient = true; 626 final String contentType = urlConnection.getContentType(); 627 final InputStream inputStream = urlConnection.getInputStream(); 628 @SuppressWarnings("resource") // managed by the InputStreamReader tracked by this instance 629 // @formatter:off 630 final BOMInputStream bomInput = BOMInputStream.builder() 631 .setInputStream(new BufferedInputStream(inputStream, IOUtils.DEFAULT_BUFFER_SIZE)) 632 .setInclude(false) 633 .setByteOrderMarks(BOMS) 634 .get(); 635 @SuppressWarnings("resource") 636 final BOMInputStream piInput = BOMInputStream.builder() 637 .setInputStream(new BufferedInputStream(bomInput, IOUtils.DEFAULT_BUFFER_SIZE)) 638 .setInclude(true) 639 .setByteOrderMarks(XML_GUESS_BYTES) 640 .get(); 641 // @formatter:on 642 if (urlConnection instanceof HttpURLConnection || contentType != null) { 643 this.encoding = processHttpStream(bomInput, piInput, contentType, lenient); 644 } else { 645 this.encoding = doRawStream(bomInput, piInput, lenient); 646 } 647 this.reader = new InputStreamReader(piInput, encoding); 648 } 649 650 /** 651 * Calculates the HTTP encoding. 652 * 653 * @param httpContentType The HTTP content type 654 * @param bomEnc BOM encoding 655 * @param xmlGuessEnc XML Guess encoding 656 * @param xmlEnc XML encoding 657 * @param lenient indicates if the charset encoding detection should be relaxed. 658 * @return the HTTP encoding 659 * @throws IOException thrown if there is a problem reading the stream. 
660 */ 661 String calculateHttpEncoding(final String httpContentType, final String bomEnc, final String xmlGuessEnc, final String xmlEnc, final boolean lenient) 662 throws IOException { 663 664 // Lenient and has XML encoding 665 if (lenient && xmlEnc != null) { 666 return xmlEnc; 667 } 668 669 // Determine mime/encoding content types from HTTP Content Type 670 final String cTMime = getContentTypeMime(httpContentType); 671 final String cTEnc = getContentTypeEncoding(httpContentType); 672 final boolean appXml = isAppXml(cTMime); 673 final boolean textXml = isTextXml(cTMime); 674 675 // Mime type NOT "application/xml" or "text/xml" 676 if (!appXml && !textXml) { 677 final String msg = MessageFormat.format(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 678 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 679 } 680 681 // No content type encoding 682 if (cTEnc == null) { 683 if (appXml) { 684 return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc); 685 } 686 return defaultEncoding == null ? 
US_ASCII : defaultEncoding; 687 } 688 689 // UTF-16BE or UTF-16LE content type encoding 690 if (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE)) { 691 if (bomEnc != null) { 692 final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 693 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 694 } 695 return cTEnc; 696 } 697 698 // UTF-16 content type encoding 699 if (cTEnc.equals(UTF_16)) { 700 if (bomEnc != null && bomEnc.startsWith(UTF_16)) { 701 return bomEnc; 702 } 703 final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 704 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 705 } 706 707 // UTF-32BE or UTF-132E content type encoding 708 if (cTEnc.equals(UTF_32BE) || cTEnc.equals(UTF_32LE)) { 709 if (bomEnc != null) { 710 final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 711 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 712 } 713 return cTEnc; 714 } 715 716 // UTF-32 content type encoding 717 if (cTEnc.equals(UTF_32)) { 718 if (bomEnc != null && bomEnc.startsWith(UTF_32)) { 719 return bomEnc; 720 } 721 final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 722 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 723 } 724 725 return cTEnc; 726 } 727 728 /** 729 * Calculate the raw encoding. 730 * 731 * @param bomEnc BOM encoding 732 * @param xmlGuessEnc XML Guess encoding 733 * @param xmlEnc XML encoding 734 * @return the raw encoding 735 * @throws IOException thrown if there is a problem reading the stream. 
736 */ 737 String calculateRawEncoding(final String bomEnc, final String xmlGuessEnc, final String xmlEnc) throws IOException { 738 739 // BOM is Null 740 if (bomEnc == null) { 741 if (xmlGuessEnc == null || xmlEnc == null) { 742 return defaultEncoding == null ? UTF_8 : defaultEncoding; 743 } 744 if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) { 745 return xmlGuessEnc; 746 } 747 return xmlEnc; 748 } 749 750 // BOM is UTF-8 751 if (bomEnc.equals(UTF_8)) { 752 if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) { 753 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc); 754 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 755 } 756 if (xmlEnc != null && !xmlEnc.equals(UTF_8)) { 757 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc); 758 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 759 } 760 return bomEnc; 761 } 762 763 // BOM is UTF-16BE or UTF-16LE 764 if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) { 765 if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) { 766 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc); 767 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 768 } 769 if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) { 770 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc); 771 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 772 } 773 return bomEnc; 774 } 775 776 // BOM is UTF-32BE or UTF-32LE 777 if (bomEnc.equals(UTF_32BE) || bomEnc.equals(UTF_32LE)) { 778 if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) { 779 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc); 780 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 781 } 782 if (xmlEnc != null && !xmlEnc.equals(UTF_32) && !xmlEnc.equals(bomEnc)) { 783 final String msg = 
MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc); 784 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 785 } 786 return bomEnc; 787 } 788 789 // BOM is something else 790 final String msg = MessageFormat.format(RAW_EX_2, bomEnc, xmlGuessEnc, xmlEnc); 791 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 792 } 793 794 /** 795 * Closes the XmlStreamReader stream. 796 * 797 * @throws IOException thrown if there was a problem closing the stream. 798 */ 799 @Override 800 public void close() throws IOException { 801 reader.close(); 802 } 803 804 /** 805 * Does lenient detection. 806 * 807 * @param httpContentType content-type header to use for the resolution of the charset encoding. 808 * @param ex The thrown exception 809 * @return the encoding 810 * @throws IOException thrown if there is a problem reading the stream. 811 */ 812 private String doLenientDetection(String httpContentType, XmlStreamReaderException ex) throws IOException { 813 if (httpContentType != null && httpContentType.startsWith("text/html")) { 814 httpContentType = httpContentType.substring("text/html".length()); 815 httpContentType = "text/xml" + httpContentType; 816 try { 817 return calculateHttpEncoding(httpContentType, ex.getBomEncoding(), ex.getXmlGuessEncoding(), ex.getXmlEncoding(), true); 818 } catch (final XmlStreamReaderException ex2) { 819 ex = ex2; 820 } 821 } 822 String encoding = ex.getXmlEncoding(); 823 if (encoding == null) { 824 encoding = ex.getContentTypeEncoding(); 825 } 826 if (encoding == null) { 827 encoding = defaultEncoding == null ? UTF_8 : defaultEncoding; 828 } 829 return encoding; 830 } 831 832 /** 833 * Process the raw stream. 834 * 835 * @param bom BOMInputStream to detect byte order marks 836 * @param pis BOMInputStream to guess XML encoding 837 * @param lenient indicates if the charset encoding detection should be relaxed. 
838 * @return the encoding to be used 839 * @throws IOException thrown if there is a problem reading the stream. 840 */ 841 private String doRawStream(final BOMInputStream bom, final BOMInputStream pis, final boolean lenient) throws IOException { 842 final String bomEnc = bom.getBOMCharsetName(); 843 final String xmlGuessEnc = pis.getBOMCharsetName(); 844 final String xmlEnc = getXmlProlog(pis, xmlGuessEnc); 845 try { 846 return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc); 847 } catch (final XmlStreamReaderException ex) { 848 if (lenient) { 849 return doLenientDetection(null, ex); 850 } 851 throw ex; 852 } 853 } 854 855 /** 856 * Gets the default encoding to use if none is set in HTTP content-type, XML prolog and the rules based on content-type are not adequate. 857 * <p> 858 * If it is NULL the content-type based rules are used. 859 * </p> 860 * 861 * @return the default encoding to use. 862 */ 863 public String getDefaultEncoding() { 864 return defaultEncoding; 865 } 866 867 /** 868 * Gets the charset encoding of the XmlStreamReader. 869 * 870 * @return charset encoding. 871 */ 872 public String getEncoding() { 873 return encoding; 874 } 875 876 /** 877 * Processes an HTTP stream. 878 * 879 * @param bomInput BOMInputStream to detect byte order marks 880 * @param piInput BOMInputStream to guess XML encoding 881 * @param httpContentType The HTTP content type 882 * @param lenient indicates if the charset encoding detection should be relaxed. 883 * @return the encoding to be used 884 * @throws IOException thrown if there is a problem reading the stream. 
*/
    private String processHttpStream(final BOMInputStream bomInput, final BOMInputStream piInput, final String httpContentType, final boolean lenient)
            throws IOException {
        final String bomCharsetName = bomInput.getBOMCharsetName();
        final String guessedCharsetName = piInput.getBOMCharsetName();
        final String prologCharsetName = getXmlProlog(piInput, guessedCharsetName);
        try {
            return calculateHttpEncoding(httpContentType, bomCharsetName, guessedCharsetName, prologCharsetName, lenient);
        } catch (final XmlStreamReaderException failure) {
            // Strict mode propagates the failure; lenient mode retries using the content type.
            if (!lenient) {
                throw failure;
            }
            return doLenientDetection(httpContentType, failure);
        }
    }

    /**
     * Invokes the underlying reader's {@code read(char[], int, int)} method.
     *
     * @param buf    the buffer to read the characters into
     * @param offset The start offset
     * @param len    The number of characters to read
     * @return the number of characters read or -1 if the end of stream
     * @throws IOException if an I/O error occurs.
     */
    @Override
    public int read(final char[] buf, final int offset, final int len) throws IOException {
        final int charsRead = reader.read(buf, offset, len);
        return charsRead;
    }

}