1 package com.healthmarketscience.jackcess.scsu;
2
3 /*
4 * This sample software accompanies Unicode Technical Report #6 and
5 * distributed as is by Unicode, Inc., subject to the following:
6 *
7 * Copyright 1996-1998 Unicode, Inc.. All Rights Reserved.
8 *
9 * Permission to use, copy, modify, and distribute this software
10 * without fee is hereby granted provided that this copyright notice
11 * appears in all copies.
12 *
13 * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
14 * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
15 * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
17 * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
18 * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
19 * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
20 * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
21 *
22 * @author Asmus Freytag
23 *
24 * @version 001 Dec 25 1996
25 * @version 002 Jun 25 1997
26 * @version 003 Jul 25 1997
27 * @version 004 Aug 25 1997
28 * @version 005 Sep 30 1998
29 *
30 * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
31 * and are registered in some jurisdictions.
32 **/
33
34 /**
35 Reference decoder for the Standard Compression Scheme for Unicode (SCSU)
36
37 <H2>Notes on the Java implementation</H2>
38
39 A limitation of Java is the exclusive use of a signed byte data type.
40 The following work arounds are required:
41
42 Copying a byte to an integer variable and adding 256 for 'negative'
43 bytes gives an integer in the range 0-255.
44
45 Values of char are between 0x0000 and 0xFFFF in Java. Arithmetic on
46 char values is unsigned.
47
48 Extended characters require an int to store them. The sign is not an
49 issue because only 1024*1024 + 65536 extended characters exist.
50
51 **/
52 public class Expand extends SCSU
53 {
54 /** (re-)define (and select) a dynamic window
55 A sliding window position cannot start at any Unicode value,
56 so rather than providing an absolute offset, this function takes
57 an index value which selects among the possible starting values.
58
59 Most scripts in Unicode start on or near a half-block boundary
60 so the default behaviour is to multiply the index by 0x80. Han,
61 Hangul, Surrogates and other scripts between 0x3400 and 0xDFFF
62 show very poor locality--therefore no sliding window can be set
63 there. A jumpOffset is added to the index value to skip that region,
64 and only 167 index values total are required to select all eligible
65 half-blocks.
66
67 Finally, a few scripts straddle half block boundaries. For them, a
68 table of fixed offsets is used, and the index values from 0xF9 to
69 0xFF are used to select these special offsets.
70
71 After (re-)defining a windows location it is selected so it is ready
72 for use.
73
74 Recall that all Windows are of the same length (128 code positions).
75
76 @param iWindow - index of the window to be (re-)defined
77 @param bOffset - index for the new offset value
78 **/
79 // @005 protected <-- private here and elsewhere
80 protected void defineWindow(int iWindow, byte bOffset)
81 throws IllegalInputException
82 {
83 int iOffset = (bOffset < 0 ? bOffset + 256 : bOffset);
84
85 // 0 is a reserved value
86 if (iOffset == 0)
87 {
88 throw new IllegalInputException();
89 }
90 else if (iOffset < gapThreshold)
91 {
92 dynamicOffset[iWindow] = iOffset << 7;
93 }
94 else if (iOffset < reservedStart)
95 {
96 dynamicOffset[iWindow] = (iOffset << 7) + gapOffset;
97 }
98 else if (iOffset < fixedThreshold)
99 {
100 // more reserved values
101 throw new IllegalInputException("iOffset == "+iOffset);
102 }
103 else
104 {
105 dynamicOffset[iWindow] = fixedOffset[iOffset - fixedThreshold];
106 }
107
108 // make the redefined window the active one
109 selectWindow(iWindow);
110 }
111
112 /** (re-)define (and select) a window as an extended dynamic window
113 The surrogate area in Unicode allows access to 2**20 codes beyond the
114 first 64K codes by combining one of 1024 characters from the High
115 Surrogate Area with one of 1024 characters from the Low Surrogate
116 Area (see Unicode 2.0 for the details).
117
118 The tags SDX and UDX set the window such that each subsequent byte in
119 the range 80 to FF represents a surrogate pair. The following diagram
120 shows how the bits in the two bytes following the SDX or UDX, and a
121 subsequent data byte, map onto the bits in the resulting surrogate pair.
122
123 hbyte lbyte data
124 nnnwwwww zzzzzyyy 1xxxxxxx
125
126 high-surrogate low-surrogate
127 110110wwwwwzzzzz 110111yyyxxxxxxx
128
129 @param chOffset - Since the three top bits of chOffset are not needed to
130 set the location of the extended Window, they are used instead
131 to select the window, thereby reducing the number of needed command codes.
132 The bottom 13 bits of chOffset are used to calculate the offset relative to
133 a 7 bit input data byte to yield the 20 bits expressed by each surrogate pair.
134 **/
135 protected void defineExtendedWindow(char chOffset)
136 {
137 // The top 3 bits of iOffsetHi are the window index
138 int iWindow = chOffset >>> 13;
139
140 // Calculate the new offset
141 dynamicOffset[iWindow] = ((chOffset & 0x1FFF) << 7) + (1 << 16);
142
143 // make the redefined window the active one
144 selectWindow(iWindow);
145 }
146
147 /** string buffer length used by the following functions */
148 protected int iOut = 0;
149
150 /** input cursor used by the following functions */
151 protected int iIn = 0;
152
153 /** expand input that is in Unicode mode
154 @param in input byte array to be expanded
155 @param iCur starting index
156 @param sb string buffer to which to append expanded input
157 @return the index for the lastc byte processed
158 **/
159 protected int expandUnicode(byte []in, int iCur, StringBuilder sb)
160 throws IllegalInputException, EndOfInputException
161 {
162 for( ; iCur < in.length-1; iCur+=2 ) // step by 2:
163 {
164 byte b = in[iCur];
165
166 if (b >= UC0 && b <= UC7)
167 {
168 Debug.out("SelectWindow: ", b);
169 selectWindow(b - UC0);
170 return iCur;
171 }
172 else if (b >= UD0 && b <= UD7)
173 {
174 defineWindow( b - UD0, in[iCur+1]);
175 return iCur + 1;
176 }
177 else if (b == UDX)
178 {
179 if( iCur >= in.length - 2)
180 {
181 break; // buffer error
182 }
183 defineExtendedWindow(charFromTwoBytes(in[iCur+1], in[iCur+2]));
184 return iCur + 2;
185 }
186 else if (b == UQU)
187 {
188 if( iCur >= in.length - 2)
189 {
190 break; // error
191 }
192 // Skip command byte and output Unicode character
193 iCur++;
194 }
195
196 // output a Unicode character
197 char ch = charFromTwoBytes(in[iCur], in[iCur+1]);
198 sb.append(ch);
199 iOut++;
200 }
201
202 if( iCur == in.length)
203 {
204 return iCur;
205 }
206
207 // Error condition
208 throw new EndOfInputException();
209 }
210
211 /** assemble a char from two bytes
212 In Java bytes are signed quantities, while chars are unsigned
213 @return the character
214 @param hi most significant byte
215 @param lo least significant byte
216 */
217 public static char charFromTwoBytes(byte hi, byte lo)
218 {
219 char ch = (char)(lo >= 0 ? lo : 256 + lo);
220 return (char)(ch + (char)((hi >= 0 ? hi : 256 + hi)<<8));
221 }
222
223 /** expand portion of the input that is in single byte mode **/
224 @SuppressWarnings("fallthrough")
225 protected String expandSingleByte(byte []in)
226 throws IllegalInputException, EndOfInputException
227 {
228
229 /* Allocate the output buffer. Because of control codes, generally
230 each byte of input results in fewer than one character of
231 output. Using in.length as an intial allocation length should avoid
232 the need to reallocate in mid-stream. The exception to this rule are
233 surrogates. */
234 StringBuilder sb = new StringBuilder(in.length);
235 iOut = 0;
236
237 // Loop until all input is exhausted or an error occurred
238 int iCur;
239 Loop:
240 for( iCur = 0; iCur < in.length; iCur++ )
241 {
242 // DEBUG Debug.out("Expanding: ", iCur);
243
244 // Default behaviour is that ASCII characters are passed through
245 // (staticOffset[0] == 0) and characters with the high bit on are
246 // offset by the current dynamic (or sliding) window (this.iWindow)
247 int iStaticWindow = 0;
248 int iDynamicWindow = getCurrentWindow();
249
250 switch(in[iCur])
251 {
252 // Quote from a static Window
253 case SQ0:
254 case SQ1:
255 case SQ2:
256 case SQ3:
257 case SQ4:
258 case SQ5:
259 case SQ6:
260 case SQ7:
261 Debug.out("SQn:", iStaticWindow);
262 // skip the command byte and check for length
263 if( iCur >= in.length - 1)
264 {
265 Debug.out("SQn missing argument: ", in, iCur);
266 break Loop; // buffer length error
267 }
268 // Select window pair to quote from
269 iDynamicWindow = iStaticWindow = in[iCur] - SQ0;
270 iCur ++;
271
272 // FALL THROUGH
273
274 default:
275 // output as character
276 if(in[iCur] >= 0)
277 {
278 // use static window
279 int ch = in[iCur] + staticOffset[iStaticWindow];
280 sb.append((char)ch);
281 iOut++;
282 }
283 else
284 {
285 // use dynamic window
286 int ch = (in[iCur] + 256); // adjust for signed bytes
287 ch -= 0x80; // reduce to range 00..7F
288 ch += dynamicOffset[iDynamicWindow];
289
290 //DEBUG
291 Debug.out("Dynamic: ", (char) ch);
292
293 if (ch < 1<<16)
294 {
295 // in Unicode range, output directly
296 sb.append((char)ch);
297 iOut++;
298 }
299 else
300 {
301 // this is an extension character
302 Debug.out("Extension character: ", ch);
303
304 // compute and append the two surrogates:
305 // translate from 10000..10FFFF to 0..FFFFF
306 ch -= 0x10000;
307
308 // high surrogate = top 10 bits added to D800
309 sb.append((char)(0xD800 + (ch>>10)));
310 iOut++;
311
312 // low surrogate = bottom 10 bits added to DC00
313 sb.append((char)(0xDC00 + (ch & ~0xFC00)));
314 iOut++;
315 }
316 }
317 break;
318
319 // define a dynamic window as extended
320 case SDX:
321 iCur += 2;
322 if( iCur >= in.length)
323 {
324 Debug.out("SDn missing argument: ", in, iCur -1);
325 break Loop; // buffer length error
326 }
327 defineExtendedWindow(charFromTwoBytes(in[iCur-1], in[iCur]));
328 break;
329
330 // Position a dynamic Window
331 case SD0:
332 case SD1:
333 case SD2:
334 case SD3:
335 case SD4:
336 case SD5:
337 case SD6:
338 case SD7:
339 iCur ++;
340 if( iCur >= in.length)
341 {
342 Debug.out("SDn missing argument: ", in, iCur -1);
343 break Loop; // buffer length error
344 }
345 defineWindow(in[iCur-1] - SD0, in[iCur]);
346 break;
347
348 // Select a new dynamic Window
349 case SC0:
350 case SC1:
351 case SC2:
352 case SC3:
353 case SC4:
354 case SC5:
355 case SC6:
356 case SC7:
357 selectWindow(in[iCur] - SC0);
358 break;
359 case SCU:
360 // switch to Unicode mode and continue parsing
361 iCur = expandUnicode(in, iCur+1, sb);
362 // DEBUG Debug.out("Expanded Unicode range until: ", iCur);
363 break;
364
365 case SQU:
366 // directly extract one Unicode character
367 iCur += 2;
368 if( iCur >= in.length)
369 {
370 Debug.out("SQU missing argument: ", in, iCur - 2);
371 break Loop; // buffer length error
372 }
373 else
374 {
375 char ch = charFromTwoBytes(in[iCur-1], in[iCur]);
376
377 Debug.out("Quoted: ", ch);
378 sb.append(ch);
379 iOut++;
380 }
381 break;
382
383 case Srs:
384 throw new IllegalInputException();
385 // break;
386 }
387 }
388
389 if( iCur >= in.length)
390 {
391 //SUCCESS: all input used up
392 sb.setLength(iOut);
393 iIn = iCur;
394 return sb.toString();
395 }
396
397 Debug.out("Length ==" + in.length+" iCur =", iCur);
398 //ERROR: premature end of input
399 throw new EndOfInputException();
400 }
401
402 /** expand a byte array containing compressed Unicode */
403 public String expand (byte []in)
404 throws IllegalInputException, EndOfInputException
405 {
406 String str = expandSingleByte(in);
407 Debug.out("expand output: ", str.toCharArray());
408 return str;
409 }
410
411
412 /** reset is called to start with new input, w/o creating a new
413 instance */
414 @Override
415 public void reset()
416 {
417 iOut = 0;
418 iIn = 0;
419 super.reset();
420 }
421
422 public int charsWritten()
423 {
424 return iOut;
425 }
426
427 public int bytesRead()
428 {
429 return iIn;
430 }
431 }