View Javadoc

1   package com.healthmarketscience.jackcess.scsu;
2   
3   /*
4    * This sample software accompanies Unicode Technical Report #6 and
5    * distributed as is by Unicode, Inc., subject to the following:
6    *
7    * Copyright 1996-1998 Unicode, Inc.. All Rights Reserved.
8    *
9    * Permission to use, copy, modify, and distribute this software
10   * without fee is hereby granted provided that this copyright notice
11   * appears in all copies.
12   *
13   * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
14   * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
15   * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
16   * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
17   * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
18   * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
19   * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
20   * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
21   *
22   *  @author Asmus Freytag
23   *
24   *  @version 001 Dec 25 1996
25   *  @version 002 Jun 25 1997
26   *  @version 003 Jul 25 1997
27   *  @version 004 Aug 25 1997
28   *  @version 005 Sep 30 1998  
29   *
30   * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
31   * and are registered in some jurisdictions.
32   **/
33  
34   /**
35      Reference decoder for the Standard Compression Scheme for Unicode (SCSU)
36  
37      <H2>Notes on the Java implementation</H2>
38  
39      A limitation of Java is the exclusive use of a signed byte data type.
40      The following workarounds are required:
41  
42      Copying a byte to an integer variable and adding 256 for 'negative'
43      bytes gives an integer in the range 0-255.
44  
45      Values of char are between 0x0000 and 0xFFFF in Java. Arithmetic on
46      char values is unsigned.
47  
48      Extended characters require an int to store them. The sign is not an
49      issue because only 1024*1024 + 65536 extended characters exist.
50  
51  **/
52  public class Expand extends SCSU
53  {
54      /** (re-)define (and select) a dynamic window
55      A sliding window position cannot start at any Unicode value,
56      so rather than providing an absolute offset, this function takes
57      an index value which selects among the possible starting values.
58  
59      Most scripts in Unicode start on or near a half-block boundary
60      so the default behaviour is to multiply the index by 0x80. Han,
61      Hangul, Surrogates and other scripts between 0x3400 and 0xDFFF
62      show very poor locality--therefore no sliding window can be set
63      there. A jumpOffset is added to the index value to skip that region,
64      and only 167 index values total are required to select all eligible
65      half-blocks.
66  
67      Finally, a few scripts straddle half block boundaries. For them, a
68      table of fixed offsets is used, and the index values from 0xF9 to
69      0xFF are used to select these special offsets.
70  
71      After (re-)defining a windows location it is selected so it is ready
72      for use.
73  
74      Recall that all Windows are of the same length (128 code positions).
75  
76      @param iWindow - index of the window to be (re-)defined
77      @param bOffset - index for the new offset value
78      **/
79  	// @005 protected <-- private here and elsewhere
80      protected void defineWindow(int iWindow, byte bOffset)
81          throws IllegalInputException
82      {
83          int iOffset = (bOffset < 0 ? bOffset + 256 : bOffset);
84  
85          // 0 is a reserved value
86          if (iOffset == 0)
87          {
88              throw new IllegalInputException();
89          }
90          else if (iOffset < gapThreshold)
91          {
92              dynamicOffset[iWindow] = iOffset << 7;
93          }
94          else if (iOffset < reservedStart)
95          {
96              dynamicOffset[iWindow] = (iOffset << 7) + gapOffset;
97          }
98          else if (iOffset < fixedThreshold)
99          {
100             // more reserved values
101             throw new IllegalInputException("iOffset == "+iOffset);
102         }
103         else
104         {
105             dynamicOffset[iWindow] = fixedOffset[iOffset - fixedThreshold];
106         }
107 
108         // make the redefined window the active one
109         selectWindow(iWindow);
110     }
111 
112     /** (re-)define (and select) a window as an extended dynamic window
113     The surrogate area in Unicode allows access to 2**20 codes beyond the
114     first 64K codes by combining one of 1024 characters from the High
115     Surrogate Area with one of 1024 characters from the Low Surrogate
116     Area (see Unicode 2.0 for the details).
117 
118     The tags SDX and UDX set the window such that each subsequent byte in
119     the range 80 to FF represents a surrogate pair. The following diagram
120     shows how the bits in the two bytes following the SDX or UDX, and a
121     subsequent data byte, map onto the bits in the resulting surrogate pair.
122 
123      hbyte         lbyte          data
124     nnnwwwww      zzzzzyyy      1xxxxxxx
125 
126      high-surrogate     low-surrogate
127     110110wwwwwzzzzz   110111yyyxxxxxxx
128 
129     @param chOffset - Since the three top bits of chOffset are not needed to
130     set the location of the extended Window, they are used instead
131     to select the window, thereby reducing the number of needed command codes.
132     The bottom 13 bits of chOffset are used to calculate the offset relative to
133     a 7 bit input data byte to yield the 20 bits expressed by each surrogate pair.
134     **/
135     protected void defineExtendedWindow(char chOffset)
136     {
137         // The top 3 bits of iOffsetHi are the window index
138         int iWindow = chOffset >>> 13;
139 
140         // Calculate the new offset
141         dynamicOffset[iWindow] = ((chOffset & 0x1FFF) << 7) + (1 << 16);
142 
143         // make the redefined window the active one
144         selectWindow(iWindow);
145     }
146 
147     /** string buffer length used by the following functions: incremented for every char appended to the output, cleared by reset() and at the start of expandSingleByte */
148     protected int iOut = 0;
149 
150     /** input cursor used by the following functions: set from the loop cursor when expandSingleByte succeeds, cleared by reset(); reported by bytesRead() */
151     protected int iIn = 0;
152 
153     /** expand input that is in Unicode mode
154     @param in input byte array to be expanded
155     @param iCur starting index
156     @param sb string buffer to which to append expanded input
157     @return the index for the lastc byte processed
158     **/
159     protected int expandUnicode(byte []in, int iCur, StringBuilder sb)
160         throws IllegalInputException, EndOfInputException
161     {
162         for( ; iCur < in.length-1; iCur+=2 ) // step by 2:
163         {
164             byte b = in[iCur];
165 
166             if (b >= UC0 && b <= UC7)
167             {
168                 Debug.out("SelectWindow: ", b);
169                 selectWindow(b - UC0);
170                 return iCur;
171             }
172             else if (b >= UD0 && b <= UD7)
173             {
174                 defineWindow( b - UD0, in[iCur+1]);
175                 return iCur + 1;
176             }
177             else if (b == UDX)
178             {
179                 if( iCur >= in.length - 2)
180                 {
181                     break; // buffer error
182                 }
183                 defineExtendedWindow(charFromTwoBytes(in[iCur+1], in[iCur+2]));
184                 return iCur + 2;
185             }
186             else if (b == UQU)
187             {
188                 if( iCur >= in.length - 2)
189                 {
190                     break; // error
191                 }
192                 // Skip command byte and output Unicode character
193                 iCur++;
194             }
195 
196             // output a Unicode character
197             char ch = charFromTwoBytes(in[iCur], in[iCur+1]);
198             sb.append(ch);
199             iOut++;
200         }
201 
202         if( iCur == in.length)
203         {
204             return iCur;
205         }
206 
207         // Error condition
208         throw new EndOfInputException();
209     }
210 
211     /** assemble a char from two bytes
212     In Java bytes are signed quantities, while chars are unsigned
213     @return the character
214     @param hi most significant byte
215     @param lo least significant byte
216     */
217     public static char charFromTwoBytes(byte hi, byte lo)
218     {
219         char ch = (char)(lo >= 0 ? lo : 256 + lo);
220         return (char)(ch + (char)((hi >= 0 ? hi : 256 + hi)<<8));
221     }
222 
223     /** expand portion of the input that is in single byte mode **/
224     @SuppressWarnings("fallthrough")
225     protected String expandSingleByte(byte []in)
226         throws IllegalInputException, EndOfInputException
227     {
228 
229         /* Allocate the output buffer. Because of control codes, generally
230         each byte of input results in fewer than one character of
231         output. Using in.length as an intial allocation length should avoid
232         the need to reallocate in mid-stream. The exception to this rule are
233         surrogates. */
234         StringBuilder sb = new StringBuilder(in.length);
235         iOut = 0;
236 
237         // Loop until all input is exhausted or an error occurred
238         int iCur;
239         Loop:
240         for( iCur = 0; iCur < in.length; iCur++ )
241         {
242             // DEBUG Debug.out("Expanding: ", iCur);
243 
244             // Default behaviour is that ASCII characters are passed through
245             // (staticOffset[0] == 0) and characters with the high bit on are
246             // offset by the current dynamic (or sliding) window (this.iWindow)
247             int iStaticWindow = 0;
248             int iDynamicWindow = getCurrentWindow();
249 
250             switch(in[iCur])
251             {
252                 // Quote from a static Window
253             case SQ0:
254             case SQ1:
255             case SQ2:
256             case SQ3:
257             case SQ4:
258             case SQ5:
259             case SQ6:
260             case SQ7:
261                 Debug.out("SQn:", iStaticWindow);
262                 // skip the command byte and check for length
263                 if( iCur >= in.length - 1)
264                 {
265                     Debug.out("SQn missing argument: ", in, iCur);
266                     break Loop;  // buffer length error
267                 }
268                 // Select window pair to quote from
269                 iDynamicWindow = iStaticWindow = in[iCur] - SQ0;
270                 iCur ++;
271 
272                 // FALL THROUGH
273 
274             default:
275                 // output as character
276                 if(in[iCur] >= 0)
277                 {
278                     // use static window
279                     int ch = in[iCur] + staticOffset[iStaticWindow];
280                     sb.append((char)ch);
281                     iOut++;
282                 }
283                 else
284                 {
285                     // use dynamic window
286                     int ch = (in[iCur] + 256); // adjust for signed bytes
287                     ch -= 0x80;                // reduce to range 00..7F
288                     ch += dynamicOffset[iDynamicWindow];
289 
290                     //DEBUG
291                     Debug.out("Dynamic: ", (char) ch);
292 
293                     if (ch < 1<<16)
294                     {
295                         // in Unicode range, output directly
296                         sb.append((char)ch);
297                         iOut++;
298                     }
299                     else
300                     {
301                         // this is an extension character
302                         Debug.out("Extension character: ", ch);
303 
304                         // compute and append the two surrogates:
305                         // translate from 10000..10FFFF to 0..FFFFF
306                         ch -= 0x10000;
307 
308                         // high surrogate = top 10 bits added to D800
309                         sb.append((char)(0xD800 + (ch>>10)));
310                         iOut++;
311 
312                         // low surrogate = bottom 10 bits added to DC00
313                         sb.append((char)(0xDC00 + (ch & ~0xFC00)));
314                         iOut++;
315                     }
316                 }
317                 break;
318 
319                 // define a dynamic window as extended
320             case SDX:
321                 iCur += 2;
322                 if( iCur >= in.length)
323                 {
324                     Debug.out("SDn missing argument: ", in, iCur -1);
325                     break Loop;  // buffer length error
326                 }
327                 defineExtendedWindow(charFromTwoBytes(in[iCur-1], in[iCur]));
328                 break;
329 
330                 // Position a dynamic Window
331             case SD0:
332             case SD1:
333             case SD2:
334             case SD3:
335             case SD4:
336             case SD5:
337             case SD6:
338             case SD7:
339                 iCur ++;
340                 if( iCur >= in.length)
341                 {
342                     Debug.out("SDn missing argument: ", in, iCur -1);
343                     break Loop;  // buffer length error
344                 }
345                 defineWindow(in[iCur-1] - SD0, in[iCur]);
346                 break;
347 
348                 // Select a new dynamic Window
349             case SC0:
350             case SC1:
351             case SC2:
352             case SC3:
353             case SC4:
354             case SC5:
355             case SC6:
356             case SC7:
357                 selectWindow(in[iCur] - SC0);
358                 break;
359             case SCU:
360                 // switch to Unicode mode and continue parsing
361                 iCur = expandUnicode(in, iCur+1, sb);
362                 // DEBUG Debug.out("Expanded Unicode range until: ", iCur);
363                 break;
364 
365             case SQU:
366                 // directly extract one Unicode character
367                 iCur += 2;
368                 if( iCur >= in.length)
369                 {
370                      Debug.out("SQU missing argument: ", in, iCur - 2);
371                      break Loop;  // buffer length error
372                 }
373                 else
374                 {
375                     char ch = charFromTwoBytes(in[iCur-1], in[iCur]);
376 
377                     Debug.out("Quoted: ", ch);
378                     sb.append(ch);
379                     iOut++;
380                 }
381                 break;
382 
383              case Srs:
384                 throw new IllegalInputException();
385                 // break;
386             }
387         }
388 
389         if( iCur >= in.length)
390         {
391             //SUCCESS: all input used up
392             sb.setLength(iOut);
393             iIn = iCur;
394             return sb.toString();
395         }
396 
397         Debug.out("Length ==" + in.length+" iCur =", iCur);
398         //ERROR: premature end of input
399         throw new EndOfInputException();
400     }
401 
402     /** expand a byte array containing compressed Unicode */
403     public String expand (byte []in)
404         throws IllegalInputException, EndOfInputException
405     {
406         String str = expandSingleByte(in);
407         Debug.out("expand output: ", str.toCharArray());
408         return str;
409     }
410 
411 
412     /** reset is called to start with new input, w/o creating a new
413         instance */
414     @Override
415     public void reset()
416     {
417         iOut = 0;
418         iIn = 0;
419         super.reset();
420     }
421 
422     public int charsWritten()
423     {
424         return iOut;
425     }
426 
427     public int bytesRead()
428     {
429         return iIn;
430     }
431 }