1 package com.healthmarketscience.jackcess.scsu;
2
3 /*
4 * This sample software accompanies Unicode Technical Report #6 and
5 * distributed as is by Unicode, Inc., subject to the following:
6 *
7 * Copyright 1996-1998 Unicode, Inc.. All Rights Reserved.
8 *
9 * Permission to use, copy, modify, and distribute this software
10 * without fee is hereby granted provided that this copyright notice
11 * appears in all copies.
12 *
13 * UNICODE, INC. MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE
14 * SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING
15 * BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
17 * UNICODE, INC., SHALL NOT BE LIABLE FOR ANY ERRORS OR OMISSIONS, AND
18 * SHALL NOT BE LIABLE FOR ANY DAMAGES, INCLUDING CONSEQUENTIAL AND
19 * INCIDENTAL DAMAGES, SUFFERED BY YOU AS A RESULT OF USING, MODIFYING
20 * OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
21 *
22 * @author Asmus Freytag
23 *
24 * @version 001 Dec 25 1996
25 * @version 002 Jun 25 1997
26 * @version 003 Jul 25 1997
27 * @version 004 Aug 25 1997
28 * @version 005 Sep 30 1998
29 *
30 * Unicode and the Unicode logo are trademarks of Unicode, Inc.,
31 * and are registered in some jurisdictions.
32 **/
33
34 /**
35 Encoding text data in Unicode often requires more storage than using
36 an existing 8-bit character set and limited to the subset of characters
37 actually found in the text. The Unicode Compression Algorithm reduces
38 the necessary storage while retaining the universality of Unicode.
39 A full description of the algorithm can be found in document
40 http://www.unicode.org/unicode/reports/tr6.html
41
42 Summary
43
44 The goal of the Unicode Compression Algorithm is the ability to
45 * Express all code points in Unicode
46 * Approximate storage size for traditional character sets
47 * Work well for short strings
48 * Provide transparency for Latin-1 data
49 * Support very simple decoders
50 * Support simple as well as sophisticated encoders
51
52 If needed, further compression can be achieved by layering standard
53 file or disk-block based compression algorithms on top.
54
55 <H2>Features</H2>
56
57 Languages using small alphabets would contain runs of characters that
58 are coded close together in Unicode. These runs are interrupted only
59 by punctuation characters, which are themselves coded in proximity to
60 each other in Unicode (usually in the ASCII range).
61
62 Two basic mechanisms in the compression algorithm account for these two
63 cases, sliding windows and static windows. A window is an area of 128
64 consecutive characters in Unicode. In the compressed data stream, each
65 character from a sliding window would be represented as a byte between
66 0x80 and 0xFF, while a byte from 0x20 to 0x7F (as well as CR, LF, and
67 TAB) would always mean an ASCII character (or control).
68
69 <H2>Notes on the Java implementation</H2>
70
71 A limitation of Java is the exclusive use of a signed byte data type.
72 The following workarounds are required:
73
74 Copying a byte to an integer variable and adding 256 for 'negative'
75 bytes gives an integer in the range 0-255.
76
77 Values of char are between 0x0000 and 0xFFFF in Java. Arithmetic on
78 char values is unsigned.
79
80 Extended characters require an int to store them. The sign is not an
81 issue because only 1024*1024 + 65536 extended characters exist.
82
83 **/
public abstract class SCSU
{
    // ------------------------------------------------------------------
    // Single-byte mode tag values
    // ------------------------------------------------------------------

    /*
     * SQn - quote a single character from window pair n.
     * When the byte after the tag is below 0x80 the character comes from
     * static window n; otherwise it comes from dynamic window n.
     */
    static final byte SQ0 = 0x01; // quote from window pair 0
    static final byte SQ1 = 0x02; // quote from window pair 1
    static final byte SQ2 = 0x03; // quote from window pair 2
    static final byte SQ3 = 0x04; // quote from window pair 3
    static final byte SQ4 = 0x05; // quote from window pair 4
    static final byte SQ5 = 0x06; // quote from window pair 5
    static final byte SQ6 = 0x07; // quote from window pair 6
    static final byte SQ7 = 0x08; // quote from window pair 7

    static final byte SDX = 0x0B; // define a window as extended
    static final byte Srs = 0x0C; // reserved

    static final byte SQU = 0x0E; // quote a single Unicode character
    static final byte SCU = 0x0F; // switch to Unicode mode

    /*
     * SCn - change to dynamic window n.
     * Subsequent bytes below 0x80 are treated as command bytes or passed
     * through unchanged; all other bytes have the offset of dynamic
     * window n added to them.
     */
    static final byte SC0 = 0x10; // select window 0
    static final byte SC1 = 0x11; // select window 1
    static final byte SC2 = 0x12; // select window 2
    static final byte SC3 = 0x13; // select window 3
    static final byte SC4 = 0x14; // select window 4
    static final byte SC5 = 0x15; // select window 5
    static final byte SC6 = 0x16; // select window 6
    static final byte SC7 = 0x17; // select window 7

    /* SDn - define dynamic window n and make it the active window. */
    static final byte SD0 = 0x18; // define and select window 0
    static final byte SD1 = 0x19; // define and select window 1
    static final byte SD2 = 0x1A; // define and select window 2
    static final byte SD3 = 0x1B; // define and select window 3
    static final byte SD4 = 0x1C; // define and select window 4
    static final byte SD5 = 0x1D; // define and select window 5
    static final byte SD6 = 0x1E; // define and select window 6
    static final byte SD7 = 0x1F; // define and select window 7

    // ------------------------------------------------------------------
    // Unicode mode tag values
    // (values above 0x7F need the cast because Java bytes are signed)
    // ------------------------------------------------------------------

    /* UCn - select window n. */
    static final byte UC0 = (byte) 0xE0; // select window 0
    static final byte UC1 = (byte) 0xE1; // select window 1
    static final byte UC2 = (byte) 0xE2; // select window 2
    static final byte UC3 = (byte) 0xE3; // select window 3
    static final byte UC4 = (byte) 0xE4; // select window 4
    static final byte UC5 = (byte) 0xE5; // select window 5
    static final byte UC6 = (byte) 0xE6; // select window 6
    static final byte UC7 = (byte) 0xE7; // select window 7

    /* UDn - define and select window n. */
    static final byte UD0 = (byte) 0xE8; // define and select window 0
    static final byte UD1 = (byte) 0xE9; // define and select window 1
    static final byte UD2 = (byte) 0xEA; // define and select window 2
    static final byte UD3 = (byte) 0xEB; // define and select window 3
    static final byte UD4 = (byte) 0xEC; // define and select window 4
    static final byte UD5 = (byte) 0xED; // define and select window 5
    static final byte UD6 = (byte) 0xEE; // define and select window 6
    static final byte UD7 = (byte) 0xEF; // define and select window 7

    static final byte UQU = (byte) 0xF0; // quote a single Unicode character
    static final byte UDX = (byte) 0xF1; // define a window as extended
    static final byte Urs = (byte) 0xF2; // reserved

    // ------------------------------------------------------------------
    // Window offset tables
    // ------------------------------------------------------------------

    /** Start offsets of the 8 static windows; these never change. */
    static final int[] staticOffset =
    {
        0x0000, // ASCII for quoted tags
        0x0080, // Latin-1 Supplement (for access to punctuation)
        0x0100, // Latin Extended-A
        0x0300, // Combining Diacritical Marks
        0x2000, // General Punctuation
        0x2080, // Currency Symbols
        0x2100, // Letterlike Symbols and Number Forms
        0x3000  // CJK Symbols and punctuation
    };

    /** Start offsets the 8 dynamic (sliding) windows have initially. */
    static final int[] initialDynamicOffset =
    {
        0x0080, // Latin-1
        0x00C0, // Latin Extended A //@005 fixed from 0x0100
        0x0400, // Cyrillic
        0x0600, // Arabic
        0x0900, // Devanagari
        0x3040, // Hiragana
        0x30A0, // Katakana
        0xFF00  // Fullwidth ASCII
    };

    /**
     * Current start offsets of the dynamic windows. Each instance begins
     * with its own copy of {@link #initialDynamicOffset}.
     */
    int[] dynamicOffset = initialDynamicOffset.clone();

    // ------------------------------------------------------------------
    // State shared by encoder and decoder
    // ------------------------------------------------------------------

    /** Index of the dynamic window that is currently active. */
    private int iWindow = 0;

    /**
     * Makes the given dynamic window the active one.
     *
     * @param iWindow index of the dynamic window to activate
     */
    protected void selectWindow(int iWindow)
    {
        this.iWindow = iWindow;
    }

    /**
     * Returns the index of the active dynamic window.
     *
     * @return index of the currently selected dynamic window
     */
    protected int getCurrentWindow()
    {
        return iWindow;
    }

    // ------------------------------------------------------------------
    // Values used when defining a window
    // ------------------------------------------------------------------

    /**
     * Unicode code points from 3400 to E000 are not addressable by a
     * dynamic window, because no short-run alphabets are found in these
     * areas. gapOffset is therefore added to all values starting at
     * gapThreshold.
     */
    static final int gapThreshold = 0x68;
    static final int gapOffset = 0xAC00;

    /* Values from reservedStart up to (excluding) fixedThreshold are reserved. */
    static final int reservedStart = 0xA8;

    /* Values from fixedThreshold upwards index the predefined fixedOffset table. */
    static final int fixedThreshold = 0xF9;

    /** Predefined fixed offsets, annotated with the byte value selecting each entry. */
    static final int[] fixedOffset =
    {
        /* 0xF9 */ 0x00C0, // Latin-1 Letters + half of Latin Extended A
        /* 0xFA */ 0x0250, // IPA extensions
        /* 0xFB */ 0x0370, // Greek
        /* 0xFC */ 0x0530, // Armenian
        /* 0xFD */ 0x3040, // Hiragana
        /* 0xFE */ 0x30A0, // Katakana
        /* 0xFF */ 0xFF60  // Halfwidth Katakana
    };

    /**
     * Tells whether a character is compressible, i.e. whether it lies
     * outside the range 0x3400 (inclusive) to 0xE000 (exclusive).
     *
     * @param ch the character to examine
     * @return {@code true} if {@code ch} is below 0x3400 or at/above 0xE000
     */
    public static boolean isCompressible(char ch)
    {
        final boolean insideGap = ch >= 0x3400 && ch < 0xE000;
        return !insideGap;
    }

    /**
     * Restores the initial state: every dynamic window offset goes back to
     * its initial value and window 0 becomes the active window. Only needed
     * to bail out after an exception and restart with new input.
     */
    public void reset()
    {
        // put all dynamic windows back to their initial offsets
        System.arraycopy(initialDynamicOffset, 0,
                         dynamicOffset, 0, dynamicOffset.length);
        iWindow = 0;
    }
}