/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * $Id: WriterToUTF8Buffered.java 469356 2006-10-31 03:20:34Z minchau $
 */
package org.apache.xml.serializer;

import java.io.IOException;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.io.Writer;


/**
 * This class writes unicode characters to a byte stream (java.io.OutputStream)
 * as quickly as possible. It buffers the output in an internal
 * buffer which must be flushed to the OutputStream when done. This flushing
 * is done via the close(), flush() or flushBuffer() method.
 *
 * This class is only used internally within Xalan.
 *
 * @xsl.usage internal
 */
final class WriterToUTF8Buffered extends Writer implements WriterChain
{

  /**
   * Number of bytes that the byte buffer can hold.
   * A fixed constant is used rather than m_outputBytes.length for performance.
   */
  private static final int BYTES_MAX = 16 * 1024;

  /**
   * Number of characters that the character buffer can hold.
   * This is 1/3 of the number of bytes because UTF-8 encoding
   * can expand one unicode character by up to 3 bytes.
   */
  private static final int CHARS_MAX = (BYTES_MAX / 3);

  /** The byte stream to write to. */
  private final OutputStream m_os;

  /**
   * The internal buffer where encoded bytes are stored until flushed.
   * It is allocated with 3 bytes of slack beyond BYTES_MAX.
   */
  private final byte m_outputBytes[];

  /**
   * Scratch buffer that receives the characters of a String (or of one
   * chunk of a long String) before they are encoded.
   */
  private final char m_inputChars[];

  /**
   * The number of valid bytes in the buffer. Elements
   * <code>m_outputBytes[0]</code> through <code>m_outputBytes[count-1]</code>
   * contain valid byte data.
   */
  private int count;

  /**
   * Create a buffered UTF-8 writer.
   *
   * @param out the underlying output stream.
   */
  public WriterToUTF8Buffered(OutputStream out)
  {
    m_os = out;

    // get 3 extra bytes to make buffer overflow checking simpler and faster
    // we won't have to keep checking for a few extra characters
    m_outputBytes = new byte[BYTES_MAX + 3];

    // Big enough to hold the input chars that will be transformed
    // into output bytes in m_outputBytes.
    m_inputChars = new char[CHARS_MAX + 2];
    count = 0;
  }

  /**
   * Write a single character, encoded as UTF-8, into the buffer.
   *
   * <p>Values below 0x80 take one byte, below 0x800 two bytes, below
   * 0x10000 three bytes, and anything else four bytes. A surrogate char
   * passed on its own is below 0x10000 and therefore goes through the
   * three byte branch; no pairing is attempted here.
   *
   * @param c int specifying a character to be written.
   * @exception IOException If an I/O error occurs while flushing
   */
  public void write(final int c) throws IOException
  {

    /* If we are close to the end of the buffer then flush it.
     * Remember the buffer can hold a few more bytes than BYTES_MAX
     */
    if (count >= BYTES_MAX)
      flushBuffer();

    if (c < 0x80)
    {
      m_outputBytes[count++] = (byte) (c);
    }
    else if (c < 0x800)
    {
      m_outputBytes[count++] = (byte) (0xc0 + (c >> 6));
      m_outputBytes[count++] = (byte) (0x80 + (c & 0x3f));
    }
    else if (c < 0x10000)
    {
      m_outputBytes[count++] = (byte) (0xe0 + (c >> 12));
      m_outputBytes[count++] = (byte) (0x80 + ((c >> 6) & 0x3f));
      m_outputBytes[count++] = (byte) (0x80 + (c & 0x3f));
    }
    else
    {
      m_outputBytes[count++] = (byte) (0xf0 + (c >> 18));
      m_outputBytes[count++] = (byte) (0x80 + ((c >> 12) & 0x3f));
      m_outputBytes[count++] = (byte) (0x80 + ((c >> 6) & 0x3f));
      m_outputBytes[count++] = (byte) (0x80 + (c & 0x3f));
    }

  }


  /**
   * Write a portion of an array of characters.
   *
   * <p>If the worst-case encoded size does not fit in the unused part of
   * the buffer, the buffer is flushed first. If it does not fit in an
   * empty buffer either, the range is cut into chunks and each chunk is
   * written by a recursive call.
   *
   * @param chars Array of characters
   * @param start Offset from which to start writing characters
   * @param length Number of characters to write
   *
   * @exception IOException If an I/O error occurs
   */
  public void write(final char chars[], final int start, final int length)
          throws java.io.IOException
  {

    // We multiply the length by three since this is the maximum length
    // of the characters that we can put into the buffer. It is possible
    // for each Unicode character to expand to three bytes.
    // The product is computed as a long so that a very large length
    // cannot wrap around to a negative int and bypass the checks below.
    final long lengthx3 = 3L * length;

    if (lengthx3 >= BYTES_MAX - count)
    {
      // The requested length is greater than the unused part of the buffer
      flushBuffer();

      if (lengthx3 > BYTES_MAX)
      {
        /*
         * The requested length exceeds the size of the buffer.
         * Cut the buffer up into chunks, each of which will
         * not cause an overflow to the output buffer m_outputBytes,
         * and make multiple recursive calls.
         * Be careful about integer overflows in multiplication.
         */
        final int split = length / CHARS_MAX;
        final int chunks = (length % CHARS_MAX > 0) ? split + 1 : split;
        int end_chunk = start;
        for (int chunk = 1; chunk <= chunks; chunk++)
        {
          final int start_chunk = end_chunk;
          end_chunk = start + (int) ((((long) length) * chunk) / chunks);

          // Adjust the end of the chunk if it ends on a high char
          // of a Unicode surrogate pair and low char of the pair
          // is not going to be in the same chunk
          final char c = chars[end_chunk - 1];
          if (c >= 0xD800 && c <= 0xDBFF)
          {
            if (end_chunk < start + length)
            {
              // Avoid spanning by including the low
              // char in the current chunk of chars.
              end_chunk++;
            }
            else
            {
              /* This is the last char of the last chunk,
               * and it is the high char of a high/low pair with
               * no low char provided.
               * TODO: error message needed.
               * Recover by leaving this last char out.
               */
              end_chunk--;
            }
          }

          final int len_chunk = (end_chunk - start_chunk);
          this.write(chars, start_chunk, len_chunk);
        }
        return;
      }
    }

    encodeChars(chars, start, start + length);
  }

  /**
   * Write a string.
   *
   * <p>The characters are first copied into the internal character buffer
   * and then encoded. A string whose worst-case encoded size exceeds the
   * byte buffer is processed in chunks.
   *
   * @param s String to be written
   *
   * @exception IOException If an I/O error occurs
   */
  public void write(final String s) throws IOException
  {

    // We multiply the length by three since this is the maximum length
    // of the characters that we can put into the buffer. It is possible
    // for each Unicode character to expand to three bytes.
    // Computed as a long so the multiplication cannot overflow.
    final int length = s.length();
    final long lengthx3 = 3L * length;

    if (lengthx3 >= BYTES_MAX - count)
    {
      // The requested length is greater than the unused part of the buffer
      flushBuffer();

      if (lengthx3 > BYTES_MAX)
      {
        /*
         * The requested length exceeds the size of the buffer,
         * so break it up in chunks that don't exceed the buffer size.
         */
        final int start = 0;
        final int split = length / CHARS_MAX;
        final int chunks = (length % CHARS_MAX > 0) ? split + 1 : split;
        int end_chunk = 0;
        for (int chunk = 1; chunk <= chunks; chunk++)
        {
          final int start_chunk = end_chunk;
          end_chunk = start + (int) ((((long) length) * chunk) / chunks);
          s.getChars(start_chunk, end_chunk, m_inputChars, 0);
          int len_chunk = (end_chunk - start_chunk);

          // Adjust the end of the chunk if it ends on a high char
          // of a Unicode surrogate pair and low char of the pair
          // is not going to be in the same chunk
          final char c = m_inputChars[len_chunk - 1];
          if (c >= 0xD800 && c <= 0xDBFF)
          {
            // Exclude char in this chunk,
            // to avoid spanning a Unicode character
            // that is in two Java chars as a high/low surrogate.
            // The next chunk starts at end_chunk and so picks it up again.
            end_chunk--;
            len_chunk--;
            /* TODO: error message needed when chunk == chunks.
             * The String then incorrectly ends in a high char
             * of a high/low surrogate pair, but there is
             * no corresponding low as the high is the last char.
             * Recover by ignoring this last char.
             */
          }

          this.write(m_inputChars, 0, len_chunk);
        }
        return;
      }
    }

    s.getChars(0, length, m_inputChars, 0);
    encodeChars(m_inputChars, 0, length);
  }

  /**
   * Encode <code>chars[start]</code> through <code>chars[end-1]</code> as
   * UTF-8 into the byte buffer and advance <code>count</code>.
   *
   * <p>The callers in this class flush beforehand so that three bytes per
   * input char fit in the buffer; this method does no capacity check.
   *
   * <p>A high surrogate is combined with the char that follows it into one
   * four byte sequence; the following char is not checked to be a low
   * surrogate. A high surrogate that is the very last char of the range
   * has no partner and is left out, which is the same recovery the
   * chunking code in the write methods applies.
   *
   * @param chars the characters to encode
   * @param start index of the first character to encode
   * @param end index one past the last character to encode
   */
  private void encodeChars(final char[] chars, final int start, final int end)
  {
    final byte[] buf_loc = m_outputBytes; // local reference for faster access
    int count_loc = count;                // local integer for faster access
    int i = start;
    {
      /* This block could be omitted and the code would produce
       * the same result. But this block exists to give the JIT
       * a better chance of optimizing a tight and common loop which
       * occurs when writing out ASCII characters.
       */
      char c;
      for (; i < end && (c = chars[i]) < 0x80; i++)
        buf_loc[count_loc++] = (byte) c;
    }
    for (; i < end; i++)
    {
      final char c = chars[i];

      if (c < 0x80)
      {
        buf_loc[count_loc++] = (byte) (c);
      }
      else if (c < 0x800)
      {
        buf_loc[count_loc++] = (byte) (0xc0 + (c >> 6));
        buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f));
      }
      /*
       * The following else if condition is added to support XML 1.1 Characters for
       * UTF-8:   [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
       * Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
       *          [1101 11yy] [yyxx xxxx] (low surrogate)
       *          * uuuuu = wwww + 1
       */
      else if (c >= 0xD800 && c <= 0xDBFF)
      {
        if (i + 1 >= end)
        {
          // Unpaired high surrogate at the end of the range: reading
          // chars[end] would go outside the range, so leave it out.
          break;
        }

        final char high = c;
        final char low = chars[++i];

        // Adding 0x40 turns the wwww field into uuuuu (= wwww + 1),
        // which then occupies bits 10..6 of the sum.
        final int highPlus = high + 0x40;

        // Lead byte carries the top three bits of uuuuu (bits 10..8).
        buf_loc[count_loc++] = (byte) (0xF0 | ((highPlus >> 8) & 0x07));
        buf_loc[count_loc++] = (byte) (0x80 | ((highPlus >> 2) & 0x3f));
        buf_loc[count_loc++] = (byte) (0x80 | ((low >> 6) & 0x0f) | ((high << 4) & 0x30));
        buf_loc[count_loc++] = (byte) (0x80 | (low & 0x3f));
      }
      else
      {
        buf_loc[count_loc++] = (byte) (0xe0 + (c >> 12));
        buf_loc[count_loc++] = (byte) (0x80 + ((c >> 6) & 0x3f));
        buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f));
      }
    }
    // Store the local integer back into the instance variable
    count = count_loc;
  }

  /**
   * Flush the internal buffer to the underlying output stream, without
   * flushing that stream itself. Does nothing when the buffer is empty.
   *
   * @throws IOException If writing to the underlying stream fails
   */
  public void flushBuffer() throws IOException
  {

    if (count > 0)
    {
      m_os.write(m_outputBytes, 0, count);

      count = 0;
    }
  }

  /**
   * Flush the stream: write any buffered bytes to the underlying output
   * stream and then flush that stream.
   *
   * @exception IOException If an I/O error occurs
   */
  public void flush() throws java.io.IOException
  {
    flushBuffer();
    m_os.flush();
  }

  /**
   * Close the stream, flushing the internal buffer first. The underlying
   * output stream is closed even when flushing the buffer fails.
   *
   * @exception IOException If an I/O error occurs
   */
  public void close() throws java.io.IOException
  {
    try
    {
      flushBuffer();
    }
    finally
    {
      m_os.close();
    }
  }

  /**
   * Get the output stream where the events will be serialized to.
   *
   * @return reference to the underlying output stream given to the
   * constructor.
   */
  public OutputStream getOutputStream()
  {
    return m_os;
  }

  /**
   * Get the wrapped writer, of which this class has none.
   *
   * @return always null
   */
  public Writer getWriter()
  {
    // Only one of getWriter() or getOutputStream() can return null
    // This type of writer wraps an OutputStream, not a Writer.
    return null;
  }
}