View Javadoc

1   package com.healthmarketscience.jackcess.scsu;
2   
3   /*
4    * This sample software accompanies Unicode Technical Report #6 and
5    * distributed as is by Unicode, Inc., subject to the following:
6    *
7    * Copyright 1996-1998 Unicode, Inc.. All Rights Reserved.
8    *
9    * Permission to use, copy, modify, and distribute this software
10   * without fee is hereby granted provided that this copyright notice
11   * appears in all copies.
12   *
13   * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
14   * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
15   * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
16   * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
17   * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
18   * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
19   * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
20   * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
21   *
22   *  @author Asmus Freytag
23   *
24   *  @version 001 Dec 25 1996
25   *  @version 002 Jun 25 1997
26   *  @version 003 Jul 25 1997
27   *  @version 004 Aug 25 1997
28   *  @version 005 Sep 30 1998
29   *
30   * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
31   * and are registered in some jurisdictions.
32   **/
33  
34   /**
35      Encoding text data in Unicode often requires more storage than using
36      an existing 8-bit character set and limited to the subset of characters
37      actually found in the text. The Unicode Compression Algorithm reduces
38      the necessary storage while retaining the universality of Unicode.
39      A full description of the algorithm can be found in document
40      http://www.unicode.org/unicode/reports/tr6.html
41  
42      Summary
43  
44      The goal of the Unicode Compression Algorithm is the ability to
45      * Express all code points in Unicode
46      * Approximate storage size for traditional character sets
47      * Work well for short strings
48      * Provide transparency for Latin-1 data
49      * Support very simple decoders
50      * Support simple as well as sophisticated encoders
51  
52      If needed, further compression can be achieved by layering standard
53      file or disk-block based compression algorithms on top.
54  
55      <H2>Features</H2>
56  
57      Languages using small alphabets would contain runs of characters that
58      are coded close together in Unicode. These runs are interrupted only
59      by punctuation characters, which are themselves coded in proximity to
60      each other in Unicode (usually in the ASCII range).
61  
62      Two basic mechanisms in the compression algorithm account for these two
63      cases, sliding windows and static windows. A window is an area of 128
64      consecutive characters in Unicode. In the compressed data stream, each
65      character from a sliding window would be represented as a byte between
66      0x80 and 0xFF, while a byte from 0x20 to 0x7F (as well as CR, LF, and
67      TAB) would always mean an ASCII character (or control).
68  
69      <H2>Notes on the Java implementation</H2>
70  
71      A limitation of Java is the exclusive use of a signed byte data type.
72      The following work arounds are required:
73  
74      Copying a byte to an integer variable and adding 256 for 'negative'
75      bytes gives an integer in the range 0-255.
76  
77      Values of char are between 0x0000 and 0xFFFF in Java. Arithmetic on
78      char values is unsigned.
79  
80      Extended characters require an int to store them. The sign is not an
81      issue because only 1024*1024 + 65536 extended characters exist.
82  
83  **/
/**
 * Shared constants and dynamic-window state for the Standard Compression
 * Scheme for Unicode (SCSU).
 *
 * <p>The class holds the command (tag) byte values, the static window
 * offsets, the initial and current dynamic window offsets, and the index of
 * the currently active dynamic window. It is abstract; the comments in this
 * file describe the state as common to the encoder and the decoder.
 */
public abstract class SCSU
{
    // ---- Single Byte mode command values ----

    /** SQ<i>n</i> Quote from Window. <p>
    If the following byte is less than 0x80, quote from
    static window <i>n</i>, else quote from dynamic window <i>n</i>.
    */
    static final byte SQ0 = 0x01; // Quote from window pair 0
    static final byte SQ1 = 0x02; // Quote from window pair 1
    static final byte SQ2 = 0x03; // Quote from window pair 2
    static final byte SQ3 = 0x04; // Quote from window pair 3
    static final byte SQ4 = 0x05; // Quote from window pair 4
    static final byte SQ5 = 0x06; // Quote from window pair 5
    static final byte SQ6 = 0x07; // Quote from window pair 6
    static final byte SQ7 = 0x08; // Quote from window pair 7

    static final byte SDX = 0x0B; // Define a window as extended
    static final byte Srs = 0x0C; // reserved

    static final byte SQU = 0x0E; // Quote a single Unicode character
    static final byte SCU = 0x0F; // Change to Unicode mode

    /** SC<i>n</i> Change to Window <i>n</i>. <p>
    If the following bytes are less than 0x80, interpret them
    as command bytes or pass them through, else add the offset
    for dynamic window <i>n</i>. */
    static final byte SC0 = 0x10; // Select window 0
    static final byte SC1 = 0x11; // Select window 1
    static final byte SC2 = 0x12; // Select window 2
    static final byte SC3 = 0x13; // Select window 3
    static final byte SC4 = 0x14; // Select window 4
    static final byte SC5 = 0x15; // Select window 5
    static final byte SC6 = 0x16; // Select window 6
    static final byte SC7 = 0x17; // Select window 7
    static final byte SD0 = 0x18; // Define and select window 0
    static final byte SD1 = 0x19; // Define and select window 1
    static final byte SD2 = 0x1A; // Define and select window 2
    static final byte SD3 = 0x1B; // Define and select window 3
    static final byte SD4 = 0x1C; // Define and select window 4
    static final byte SD5 = 0x1D; // Define and select window 5
    static final byte SD6 = 0x1E; // Define and select window 6
    static final byte SD7 = 0x1F; // Define and select window 7

    // ---- U-prefixed command values (0xE0 - 0xF2) ----
    // NOTE(review): presumably the Unicode-mode counterparts of the
    // single-byte commands above; confirm against the encoder/decoder.
    // The (byte) casts are needed because these values exceed Byte.MAX_VALUE.

    static final byte UC0 = (byte) 0xE0; // Select window 0
    static final byte UC1 = (byte) 0xE1; // Select window 1
    static final byte UC2 = (byte) 0xE2; // Select window 2
    static final byte UC3 = (byte) 0xE3; // Select window 3
    static final byte UC4 = (byte) 0xE4; // Select window 4
    static final byte UC5 = (byte) 0xE5; // Select window 5
    static final byte UC6 = (byte) 0xE6; // Select window 6
    static final byte UC7 = (byte) 0xE7; // Select window 7
    static final byte UD0 = (byte) 0xE8; // Define and select window 0
    static final byte UD1 = (byte) 0xE9; // Define and select window 1
    static final byte UD2 = (byte) 0xEA; // Define and select window 2
    static final byte UD3 = (byte) 0xEB; // Define and select window 3
    static final byte UD4 = (byte) 0xEC; // Define and select window 4
    static final byte UD5 = (byte) 0xED; // Define and select window 5
    static final byte UD6 = (byte) 0xEE; // Define and select window 6
    static final byte UD7 = (byte) 0xEF; // Define and select window 7

    static final byte UQU = (byte) 0xF0; // Quote a single Unicode character
    static final byte UDX = (byte) 0xF1; // Define a Window as extended
    static final byte Urs = (byte) 0xF2; // reserved

    /** constant offsets for the 8 static windows */
    static final int[] staticOffset =
    {
        0x0000, // ASCII for quoted tags
        0x0080, // Latin - 1 Supplement (for access to punctuation)
        0x0100, // Latin Extended-A
        0x0300, // Combining Diacritical Marks
        0x2000, // General Punctuation
        0x2080, // Currency Symbols
        0x2100, // Letterlike Symbols and Number Forms
        0x3000  // CJK Symbols and punctuation
    };

    /** initial offsets for the 8 dynamic (sliding) windows */
    static final int[] initialDynamicOffset =
    {
        0x0080, // Latin-1
        0x00C0, // Latin Extended A   //@005 fixed from 0x0100
        0x0400, // Cyrillic
        0x0600, // Arabic
        0x0900, // Devanagari
        0x3040, // Hiragana
        0x30A0, // Katakana
        0xFF00  // Fullwidth ASCII
    };

    /**
     * dynamic window offsets, initialized to the default values.
     * This is a per-instance copy, so redefining a window never modifies
     * {@link #initialDynamicOffset}.
     */
    int[] dynamicOffset = initialDynamicOffset.clone();

    // The following state and accessors are common to encoder and decoder

    private int iWindow = 0;    // current active window

    /**
     * Select the active dynamic window.
     *
     * @param iWindow index of the dynamic window to make active; the value
     *                is stored as given, without range checking
     */
    protected void selectWindow(int iWindow)
    {
        this.iWindow = iWindow;
    }

    /**
     * Return the index of the active dynamic window.
     *
     * @return the value last passed to {@link #selectWindow(int)}, or 0 if
     *         no window has been selected since construction or the last
     *         {@link #reset()}
     */
    protected int getCurrentWindow()
    {
        return this.iWindow;
    }

    // ---- The following values are used in defineWindow ----

    /**
     * Unicode code points from 3400 to E000 are not addressable by
     * dynamic window, since in these areas no short run alphabets are
     * found. Therefore add gapOffset to all values from gapThreshold */
    static final int gapThreshold = 0x68;
    static final int gapOffset = 0xAC00;

    /* values between reservedStart and fixedThreshold are reserved */
    static final int reservedStart = 0xA8;

    /* use table of predefined fixed offsets for values from fixedThreshold */
    static final int fixedThreshold = 0xF9;

    /**
     * Table of fixed predefined offsets. Entry <i>i</i> corresponds to the
     * byte value {@code fixedThreshold + i}, as marked on each row.
     */
    static final int[] fixedOffset =
    {
        /* 0xF9 */ 0x00C0, // Latin-1 Letters + half of Latin Extended A
        /* 0xFA */ 0x0250, // IPA extensions
        /* 0xFB */ 0x0370, // Greek
        /* 0xFC */ 0x0530, // Armenian
        /* 0xFD */ 0x3040, // Hiragana
        /* 0xFE */ 0x30A0, // Katakana
        /* 0xFF */ 0xFF60  // Halfwidth Katakana
    };

    /**
     * Whether a character is compressible.
     *
     * @param ch the UTF-16 code unit to test
     * @return {@code true} if {@code ch} is below 0x3400 or at/above 0xE000,
     *         i.e. outside the range that dynamic windows cannot address
     */
    public static boolean isCompressible(char ch)
    {
        return (ch < 0x3400 || ch >= 0xE000);
    }

    /**
     * Restore the initial state: every dynamic window offset goes back to
     * its default and window 0 becomes the active window. Reset is only
     * needed to bail out after an exception and restart with new input.
     */
    public void reset()
    {
        // reset the dynamic windows
        System.arraycopy(initialDynamicOffset, 0,
                         dynamicOffset, 0, dynamicOffset.length);
        this.iWindow = 0;
    }
}