/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * $Id: Encodings.java 1225414 2011-12-29 02:38:30Z mrglavas $
 */
package org.apache.xml.serializer;

import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.List;
import java.util.Properties;
import java.util.StringTokenizer;


/**
 * Provides information about encodings. Depends on the Java runtime
 * to provide writers for the different encodings.
 * <p>
 * The encoding tables are filled once, during class initialization, from
 * the properties resource named by {@link #ENCODINGS_FILE}.
 * <p>
 * This class is not a public API. It is only public because it
 * is used outside of this package.
 *
 * @xsl.usage internal
 */

public final class Encodings extends Object
{
    /**
     * Standard filename for properties file with encodings data.
     */
    private static final String ENCODINGS_FILE = SerializerBase.PKG_PATH+"/Encodings.properties";

    /**
     * Returns a writer for the specified encoding based on
     * an output stream.
     * <p>
     * Every loaded encoding whose MIME name matches (ignoring case) is tried
     * with its Java name first; if none of them yields a writer, the name is
     * handed to the Java runtime unchanged.
     * <p>
     * This is not a public API.
     * @param output The output stream
     * @param encoding The encoding MIME name, not a Java name for the encoding.
     * @return A suitable writer
     * @throws UnsupportedEncodingException There is no convertor
     *  to support this encoding
     * @xsl.usage internal
     */
    static Writer getWriter(OutputStream output, String encoding)
        throws UnsupportedEncodingException
    {

        for (int i = 0; i < _encodings.length; ++i)
        {
            if (_encodings[i].name.equalsIgnoreCase(encoding))
            {
                try
                {
                    return new OutputStreamWriter(output, _encodings[i].javaName);
                }
                catch (java.lang.IllegalArgumentException iae) // java 1.1.8
                {
                    // Deliberately ignored: keep trying the remaining
                    // entries that share this MIME name.
                }
                catch (UnsupportedEncodingException usee)
                {
                    // Deliberately ignored: keep trying the remaining
                    // entries that share this MIME name.
                }
            }
        }

        // No table entry produced a writer, so let the runtime interpret
        // the name exactly as it was given.
        try
        {
            return new OutputStreamWriter(output, encoding);
        }
        catch (java.lang.IllegalArgumentException iae) // java 1.1.8
        {
            throw new UnsupportedEncodingException(encoding);
        }
    }

    /**
     * Returns the EncodingInfo object for the specified
     * encoding, never null, although the encoding name
     * inside the returned EncodingInfo object will be if
     * we can't find a "real" EncodingInfo for the encoding.
     * <p>
     * This is not a public API.
     *
     * @param encoding The encoding, either a Java name or a MIME name;
     * a null name is treated like an unknown encoding.
     * @return The object that is used to determine if
     * characters are in the given encoding.
     * @xsl.usage internal
     */
    static EncodingInfo getEncodingInfo(String encoding)
    {
        EncodingInfo ei = findEncodingInfo(encoding);
        if (ei == null)
        {
            // We shouldn't have to do this, but just in case.
            ei = new EncodingInfo(null, null, '\u0000');
        }

        return ei;
    }

    /**
     * Determines if the encoding specified was recognized by the
     * serializer or not.
     *
     * @param encoding The encoding, either a Java name or a MIME name
     * @return boolean - true if the encoding was recognized else false
     * (a null name is never recognized)
     */
    public static boolean isRecognizedEncoding(String encoding)
    {
        return findEncodingInfo(encoding) != null;
    }

    /**
     * Looks an encoding name up in the table keyed by Java name and, if it
     * is not found there, in the table keyed by MIME name.
     * <p>
     * The name is normalized with {@link #toUpperCaseFast(String)}, the same
     * normalization that is used when the tables are filled, so the lookup
     * does not depend on the default locale.
     *
     * @param encoding The encoding name, may be null
     * @return the matching EncodingInfo, or null if the name is null or
     * is in neither table
     */
    private static EncodingInfo findEncodingInfo(String encoding)
    {
        if (encoding == null)
            return null;

        final String key = toUpperCaseFast(encoding);
        EncodingInfo ei = (EncodingInfo) _encodingTableKeyJava.get(key);
        if (ei == null)
            ei = (EncodingInfo) _encodingTableKeyMime.get(key);
        return ei;
    }

    /**
     * A fast and cheap way to uppercase a String that is
     * only made of printable ASCII characters.
     * <p>
     * Only the characters 'a' to 'z' are changed; every other character
     * is copied as it is, whatever the default locale.
     * <p>
     * This is not a public API.
     * @param s a String of ASCII characters
     * @return an uppercased version of the input String,
     * possibly the same String.
     * @xsl.usage internal
     */
    static private String toUpperCaseFast(final String s)
    {
        boolean different = false;
        final int mx = s.length();
        final char[] chars = new char[mx];
        for (int i = 0; i < mx; i++)
        {
            char ch = s.charAt(i);
            // is the character a lower case ASCII one?
            if ('a' <= ch && ch <= 'z')
            {
                // a cheap and fast way to uppercase that is good enough
                ch = (char) (ch + ('A' - 'a'));
                different = true; // the uppercased String is different
            }
            chars[i] = ch;
        }

        // A little optimization, don't call String.valueOf() if
        // the uppercased string is the same as the input string.
        return different ? String.valueOf(chars) : s;
    }

    /** The default encoding, ISO style. */
    static final String DEFAULT_MIME_ENCODING = "UTF-8";

    /**
     * Get the proper mime encoding.  From the XSLT recommendation: "The encoding
     * attribute specifies the preferred encoding to use for outputting the result
     * tree. XSLT processors are required to respect values of UTF-8 and UTF-16.
     * For other values, if the XSLT processor does not support the specified
     * encoding it may signal an error; if it does not signal an error it should
     * use UTF-8 or UTF-16 instead. The XSLT processor must not use an encoding
     * whose name does not match the EncName production of the XML Recommendation
     * [XML]. If no encoding attribute is specified, then the XSLT processor should
     * use either UTF-8 or UTF-16."
     * <p>
     * This is not a public API.
     *
     * @param encoding Reference to java-style encoding string, which may be null,
     *                 in which case a default is derived from the
     *                 "file.encoding" system property.
     *
     * @return The ISO-style encoding string; {@link #DEFAULT_MIME_ENCODING}
     * is used when no encoding was given and the system property cannot be
     * read or names one of the special-cased platform encodings.
     * @xsl.usage internal
     */
    static String getMimeEncoding(String encoding)
    {
        if (null != encoding)
            return convertJava2MimeEncoding(encoding);

        try
        {
            // Get the default system character encoding.  This may be
            // incorrect if they passed in a writer, but right now there
            // seems to be no way to get the encoding from a writer.
            final String platformEncoding = System.getProperty("file.encoding", "UTF8");
            if (null == platformEncoding)
                return DEFAULT_MIME_ENCODING;

            /*
             * These platform defaults are mapped straight to the default
             * MIME encoding.  Without this check convertJava2MimeEncoding
             * would, for example, turn 8859_1 into "ISO-8859-1", which is
             * not what is wanted for an unspecified encoding.
             */
            if (platformEncoding.equalsIgnoreCase("Cp1252")
                || platformEncoding.equalsIgnoreCase("ISO8859_1")
                || platformEncoding.equalsIgnoreCase("8859_1")
                || platformEncoding.equalsIgnoreCase("UTF8"))
            {
                return DEFAULT_MIME_ENCODING;
            }

            final String mimeEncoding = convertJava2MimeEncoding(platformEncoding);
            return (null != mimeEncoding) ? mimeEncoding : DEFAULT_MIME_ENCODING;
        }
        catch (SecurityException se)
        {
            // Not allowed to read the system property: use the default.
            return DEFAULT_MIME_ENCODING;
        }
    }

    /**
     * Try the best we can to convert a Java encoding to a XML-style encoding.
     * <p>
     * This is not a public API.
     * @param encoding non-null reference to encoding string, java style.
     *
     * @return ISO-style encoding string, or the given string itself when
     * the Java name is not in the table.
     * @xsl.usage internal
     */
    private static String convertJava2MimeEncoding(String encoding)
    {
        final EncodingInfo enc =
            (EncodingInfo) _encodingTableKeyJava.get(toUpperCaseFast(encoding));
        return (null != enc) ? enc.name : encoding;
    }

    /**
     * Try the best we can to convert a XML-style (MIME) encoding name to
     * the Java encoding name.
     * <p>
     * The loaded encodings are searched in order and the Java name of the
     * first one whose MIME name matches, ignoring case, is returned.
     * <p>
     * This is not a public API.
     *
     * @param encoding reference to encoding string, MIME style.
     *
     * @return Java-style encoding string, or the given string itself when
     * no loaded encoding has that MIME name.
     * <p>
     * This method is not a public API.
     * @xsl.usage internal
     */
    public static String convertMime2JavaEncoding(String encoding)
    {

        for (int i = 0; i < _encodings.length; ++i)
        {
            if (_encodings[i].name.equalsIgnoreCase(encoding))
            {
                return _encodings[i].javaName;
            }
        }

        return encoding;
    }

    /**
     * Load a list of all the supported encodings.
     * <p>
     * Each property key is a Java encoding name. Its value is a comma
     * separated list of MIME names, optionally followed by a space and the
     * high char value. One EncodingInfo is created per MIME name; each is
     * registered in the MIME-keyed table, and the first one of every
     * property is also registered in the Java-keyed table.
     * <p>
     * Table keys are produced with {@link #toUpperCaseFast(String)} so that
     * they match the keys used by the lookup methods whatever the default
     * locale is.
     *
     * @return the EncodingInfo objects created from the properties resource;
     * an empty array if the resource could not be found.
     * @xsl.usage internal
     */
    private static EncodingInfo[] loadEncodingInfo()
    {
        final Properties props = new Properties();
        try
        {
            final InputStream is =
                SecuritySupport.getResourceAsStream(ObjectFactory.findClassLoader(),
                                                    ENCODINGS_FILE);
            if (is != null)
            {
                try
                {
                    props.load(is);
                }
                finally
                {
                    // Release the stream even when loading fails.
                    is.close();
                }
            }
            // else: Seems to be no real need to force failure here, let the
            // system do its best... The issue is not really very critical,
            // and the output will be in any case _correct_ though maybe not
            // always human-friendly... :)
        }
        catch (java.io.IOException ioe)
        {
            throw new org.apache.xml.serializer.utils.WrappedRuntimeException(ioe);
        }

        final List encodingInfoList = new ArrayList();
        final Enumeration keys = props.keys();
        while (keys.hasMoreElements())
        {
            final String javaName = (String) keys.nextElement();
            final String val = props.getProperty(javaName);
            final int len = lengthOfMimeNames(val);
            if (len == 0)
            {
                // No MIME names are listed for this Java name, so there is
                // nothing to register; the entry is skipped.
                continue;
            }

            final char highChar = parseHighChar(val.substring(len));
            final StringTokenizer st =
                new StringTokenizer(val.substring(0, len), ",");
            boolean first = true;
            while (st.hasMoreTokens())
            {
                final String mimeName = st.nextToken();
                final EncodingInfo ei = new EncodingInfo(mimeName, javaName, highChar);
                encodingInfoList.add(ei);
                _encodingTableKeyMime.put(toUpperCaseFast(mimeName), ei);
                if (first)
                {
                    // Only the first MIME name represents the Java name.
                    _encodingTableKeyJava.put(toUpperCaseFast(javaName), ei);
                    first = false;
                }
            }
        }

        // Convert the List of EncodingInfo objects into an array of them,
        // as that is the kind of thing this method returns.
        final EncodingInfo[] ret_ei = new EncodingInfo[encodingInfoList.size()];
        encodingInfoList.toArray(ret_ei);
        return ret_ei;
    }

    /**
     * Decode the high char value that follows the Mime names in a
     * property value.
     *
     * @param highVal the text after the Mime names, possibly empty or
     * surrounded by white space
     * @return the decoded value, or zero if the text is not a number that
     * {@link Integer#decode(String)} accepts
     */
    private static char parseHighChar(String highVal)
    {
        try
        {
            return (char) Integer.decode(highVal.trim()).intValue();
        }
        catch (NumberFormatException e)
        {
            // Unknown high code point: zero means every character has
            // to be tested.
            return 0;
        }
    }

    /**
     * Get the length of the Mime names within the property value
     * @param val The value of the property, which should contain a comma
     * separated list of Mime names, followed optionally by a space and the
     * high char value
     * @return the index of the first space in the value, or the length of
     * the whole value if it contains no space
     */
    private static int lengthOfMimeNames(String val)
    {
        // look for the space preceding the optional high char
        int len = val.indexOf(' ');
        // If there is no space the optional part is not there, so
        // the value must be all Mime names, so set the length appropriately
        if (len < 0)
            len = val.length();

        return len;
    }

    /**
     * Return true if the character is the high member of a surrogate pair.
     * <p>
     * This is not a public API.
     * @param ch the character to test
     * @xsl.usage internal
     */
    static boolean isHighUTF16Surrogate(char ch)
    {
        return ('\uD800' <= ch && ch <= '\uDBFF');
    }

    /**
     * Return true if the character is the low member of a surrogate pair.
     * <p>
     * This is not a public API.
     * @param ch the character to test
     * @xsl.usage internal
     */
    static boolean isLowUTF16Surrogate(char ch)
    {
        return ('\uDC00' <= ch && ch <= '\uDFFF');
    }

    /**
     * Return the unicode code point represented by the high/low surrogate pair.
     * <p>
     * This is not a public API.
     * @param highSurrogate the high char of the high/low pair
     * @param lowSurrogate the low char of the high/low pair
     * @xsl.usage internal
     */
    static int toCodePoint(char highSurrogate, char lowSurrogate)
    {
        return ((highSurrogate - 0xd800) << 10)
            + (lowSurrogate - 0xdc00)
            + 0x10000;
    }

    /**
     * Return the unicode code point represented by the char.
     * A bit of a dummy method, since all it does is return the char,
     * but as an int value.
     * <p>
     * This is not a public API.
     * @param ch the char.
     * @xsl.usage internal
     */
    static int toCodePoint(char ch)
    {
        return ch;
    }

    /**
     * Characters with values at or below the high code point are
     * in the encoding. Code point values above this one may or may
     * not be in the encoding, but lower ones certainly are.
     * <p>
     * This is for performance.
     *
     * @param encoding The encoding, either a Java name or a MIME name
     * @return The code point for which characters at or below this code point
     * are in the encoding. Characters with higher code point may or may not be
     * in the encoding. A value of zero is returned if the high code point is
     * unknown, which includes the case of a null or unrecognized encoding.
     * <p>
     * This method is not a public API.
     * @xsl.usage internal
     */
    static public char getHighChar(String encoding)
    {
        final EncodingInfo ei = findEncodingInfo(encoding);
        return (ei != null) ? ei.getHighChar() : 0;
    }

    // Declaration order matters here: loadEncodingInfo() fills both tables,
    // so they have to be initialized before _encodings is.
    private static final Hashtable _encodingTableKeyJava = new Hashtable();
    private static final Hashtable _encodingTableKeyMime = new Hashtable();
    private static final EncodingInfo[] _encodings = loadEncodingInfo();
}