1 | /************************************************************************** |
2 | * X J D X G E N |
3 | * Author: Jim Breen |
4 | * Index (.xjdx) generator program from XJDIC |
5 | * |
6 | * V2.3 - indexes JIS X 0212 (3-byte EUC) kanji |
7 | * Copyright 1998 Jim Breen <jwb@csse.monash.edu.au> |
8 | ***************************************************************************/ |
9 | /* This program is free software; you can redistribute it and/or modify |
10 | it under the terms of the GNU General Public License as published by |
11 | the Free Software Foundation; either version 1, or (at your option) |
12 | any later version. |
13 | |
14 | This program is distributed in the hope that it will be useful, |
15 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
17 | GNU General Public License for more details. |
18 | |
19 | You should have received a copy of the GNU General Public License |
20 | along with this program; if not, write to the Free Software |
21 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ |
22 | |
23 | /* Changed: ignore all rc stuff. use args 1 and 2 for input/output file. |
24 | -- jason */ |
25 | |
26 | /* Heavily commented, removed the unused header file, split off the |
27 | readDictionary function, removed unused functions and variables... cleaned |
28 | up the code in general. Preparing for integration to the rest of the program |
29 | |
30 | Note that this indexer has been hacked off of Jim Breen's xjdic program, |
31 | and a lot of the things which have been removed were relevant to that |
32 | program, but not to this one. |
33 | --Joe |
34 | */ |
35 | |
36 | #include <config-kiten.h> |
37 | #include <sys/stat.h> |
38 | #include <unistd.h> |
39 | #include <stdio.h> |
40 | #include <stdlib.h> |
41 | #include <ctype.h> |
42 | #include <string.h> |
43 | |
44 | #ifdef HAVE_STDINT_H |
45 | #include <stdint.h> |
46 | #endif |
47 | #ifdef HAVE_INTTYPES_H |
48 | #include <inttypes.h> |
49 | #endif |
50 | |
#define TRUE 1
#define FALSE 0
#define SPTAG '@'        /* leading tag character that marks a priority entry */
#define TOKENLIM 40      /* size of the token buffer, including the terminator */
/* Added to the dictionary length in the first word of the index file.
   The last time the index structure changed was Version 1.4.
   NOTE: no trailing semicolon, so the macro is usable inside expressions. */
#define INDEX_VERSION 14
56 | |
/* In-memory copy of the dictionary file as returned by readDictionary();
   main() stores a newline (10) at db[0] and after the last data byte.
   Kstrcmp() reads the strings it compares directly out of this buffer. */
unsigned char *db;
/* Index table: each slot from 1 upwards holds a byte offset into db.
   Slot 0 is filled by main() with the dictionary length plus INDEX_VERSION. */
uint32_t *jindex;
/* Size in BYTES of the jindex allocation (not the number of slots). */
uint32_t indlen;

/*====== prototypes=================================================*/
/* Sorts jindex slots i..j (inclusive) in Kstrcmp() order. */
void jqsort(int32_t i, int32_t j);
/* Compares the dictionary strings starting at db offsets lhs and rhs. */
int Kstrcmp(uint32_t lhs, uint32_t rhs);
/* Non-zero when x is an ASCII letter, an ASCII digit or a byte with the
   high bit set. */
int alphaoreuc(unsigned char x);
/* Loads the file dictName into a freshly allocated buffer and stores its
   size in *filesize. */
unsigned char* readDictionary(const char* dictName,uint32_t *filesize);
/* Fills jindex from dict and returns the last slot used. */
uint32_t buildIndex(unsigned char* dict, uint32_t dictLength);
67 | |
68 | /*====function to Load Dictionary and load/create index table=======*/ |
69 | int main(int argc, char **argv) |
70 | { |
71 | const char *Dname; |
72 | const char *JDXname; |
73 | FILE *fp; |
74 | uint32_t diclen; |
75 | uint32_t indptr; |
76 | |
77 | printf("\nNOTE: running this program by itself is never necessary. Kiten will run it automatically.\n" ); |
78 | printf("\nXJDXGEN V2.3 Index Table Generator for XJDIC. \n Copyright J.W. Breen, 1998\n" ); |
79 | |
80 | if (argc < 3) |
81 | { |
82 | printf("\nUSAGE: kitengen input output.xjdx\n" ); |
83 | exit(2); |
84 | } |
85 | |
86 | Dname = argv[1]; /*Name of the dictionary being scanned */ |
87 | JDXname = argv[2]; /*Name of the output file */ |
88 | printf("Commandline request to use files %s and %s \n" , Dname, JDXname); |
89 | printf("\nWARNING!! This program may take a long time to run .....\n" ); |
90 | |
91 | db = readDictionary(Dname,&diclen); /*Reads the dict, but leaves a space at the beginning*/ |
92 | diclen++; /*add one to the number of bytes considered in the file */ |
93 | db[diclen] = 10; /*set the first and final entry in the database to 10 */ |
94 | db[0] = 10; |
95 | printf("Dictionary size: %d bytes.\n" ,diclen); |
96 | |
97 | |
98 | indlen = (diclen * 3)/4; /*Make a wild guess at the index file length */ |
99 | jindex = (uint32_t *)malloc(indlen); /* and allocate it */ |
100 | if(jindex == NULL) |
101 | { |
102 | fprintf(stderr,"malloc() for index table failed.\n" ); |
103 | exit(1); |
104 | } |
105 | |
106 | printf("Parsing.... \n" ); |
107 | /*this is the dictionary parser. It places an entry in jindex for every |
108 | kana/kanji string and every alphabetic string it finds which is >=3 |
109 | characters */ |
110 | indptr = buildIndex(db,diclen); |
111 | |
112 | printf("Index entries: %d \nSorting (this is slow)......\n" ,indptr); |
113 | jqsort((int32_t)1,indptr); |
114 | |
115 | printf("Sorted\nWriting index file ....\n" ); |
116 | fp = fopen(JDXname,"wb" ); |
117 | if (fp==NULL ) |
118 | { |
119 | printf("\nCannot open %s output file\n" ,JDXname); |
120 | exit(1); |
121 | } |
122 | jindex[0] = diclen+INDEX_VERSION; /* prepend the index file size + version # */ |
123 | fwrite(jindex,sizeof(int32_t),indptr+1,fp); |
124 | fclose(fp); |
125 | |
126 | return 0; |
127 | } |
128 | |
129 | /*=========function to parse the dict file and fill the jindex global with the index====*/ |
130 | /*=========returns the size of the index file ====*/ |
131 | /* |
132 | A bit of explanation on what this thing generates is probably in order. |
133 | Essentially, it fills jindex with a large number of numbers... each number |
134 | being an offset to a byte location inside of the dictionary file. Starting |
135 | at position index 1 (second pos) |
136 | In other words... feeding this thing the dict file |
137 | "Llama X1\nJT Fred Flintstone X" |
138 | would generate: {<unmodified>,0,6,12,17}. |
139 | "X" is skipped because it is only 1 byte long. |
140 | "JT" is skipped because it is only two bytes long, the J is regular ascii |
141 | (<127), and the T is not a digit. If any of those were different, (it |
142 | was longer than 2 bytes, was an euc (kana or kanji) character, or T was |
143 | a digit) it would be included in the index. |
144 | */ |
145 | |
/* Bounds check for the index table: print a diagnostic and exit(1) when
   slot number x can no longer be stored in jindex.
   indlen is the size of the jindex allocation in BYTES, so the valid slot
   numbers are 0 .. indlen/sizeof(int32_t) - 1.  The comparison is therefore
   ">=": slot == capacity is already one element past the end.
   Wrapped in do { } while (0) so it behaves as a single statement. */
#define INDEX_OVERFLOW_CHECK(x) do { \
    if ((x) >= indlen / sizeof(int32_t)) { \
        printf("Index table overflow. Dictionary too large?\n"); \
        exit(1); \
    } \
} while (0)
149 | |
150 | uint32_t buildIndex(unsigned char *dict, uint32_t dictLength) { |
151 | int nowReadingWord = FALSE; /*Boolean to track if we're mid-word in the dict */ |
152 | int currentDictCharacter; /*Current character index in the dict */ |
153 | unsigned char c; /*the current reading character*/ |
154 | unsigned char currstr[TOKENLIM]; /* String that we're currently getting */ |
155 | int currstrIndex = 0; |
156 | uint32_t indptr = 1; /* next 'slot' in the index to fill */ |
157 | int saving = FALSE; /*is what we are doing right now slated for salvation?*/ |
158 | |
159 | for (currentDictCharacter =0; currentDictCharacter < dictLength; |
160 | currentDictCharacter++) |
161 | { |
162 | c = dict[currentDictCharacter]; /* Fetch the next character */ |
163 | |
164 | if(!nowReadingWord) /*if we are NOT in the middle of reading a word */ |
165 | { |
166 | if (alphaoreuc(c) || c == SPTAG) /* if character or priority entry */ |
167 | { |
168 | nowReadingWord = TRUE; /* Mark that we're mid word */ |
169 | jindex[indptr] = currentDictCharacter; |
170 | /* copy the location of this character to our index structure */ |
171 | currstrIndex = 1; |
172 | /*mark the next position in the string to copy a char into */ |
173 | currstr[0] = c; |
174 | /*set the current string to be equal to this character so far */ |
175 | currstr[1] = '\0'; |
176 | saving = TRUE; |
177 | } |
178 | } else { /*If we're in the middle of parsing a word atm */ |
179 | |
180 | /*if it's alphanumeric or - or . copy it and increment where the |
181 | next one goes */ |
182 | if ((alphaoreuc(c))||(c == '-')||(c == '.')||((c >= '0') && (c<='9'))) |
183 | { |
184 | currstr[currstrIndex] = c; |
185 | if(currstrIndex < TOKENLIM-1) |
186 | currstrIndex++; |
187 | } |
188 | else /* We were reading a word... and we just encountered the |
189 | end of the word */ |
190 | { |
191 | currstr[currstrIndex] = '\0'; /*null terminate the string */ |
192 | nowReadingWord = FALSE; |
193 | |
194 | /*Don't save single or dual character items where the |
195 | first item is ascii */ |
196 | if ((strlen(currstr) <= 2) && (currstr[0] < 127)) |
197 | saving = FALSE; |
198 | /*EXCEPT: Save anything that's two character where the second |
199 | is a number |
200 | Note that this might catch single 2-byte kanji as well... |
201 | but it might not*/ |
202 | if ((strlen(currstr) == 2) && (currstr[1] <= '9')) |
203 | saving = TRUE; |
204 | |
205 | /* This is a latin-character string, either longer than 2 bytes |
206 | or having an ascii digit for a second byte */ |
207 | if (saving && (currstr[0] < 127)) |
208 | { |
209 | indptr++; |
210 | INDEX_OVERFLOW_CHECK(indptr); |
211 | |
212 | /* If this is non-Japanese, and has a 'SPTAGn' tag, generate |
213 | two indices */ |
214 | if ( currstr[0] == SPTAG) |
215 | { |
216 | /*make a separate entry pointing to |
217 | the non-SPTAG'd entry (the next byte)*/ |
218 | jindex[indptr] = jindex[indptr-1]+1; |
219 | /*overwrite the SPTAG marker*/ |
220 | strcpy(currstr,currstr+1); |
221 | indptr++; |
222 | INDEX_OVERFLOW_CHECK(indptr); |
223 | } |
224 | } |
225 | |
226 | /*For strings that start with non latin characters*/ |
227 | if (saving && (currstr[0] > 127)) |
228 | { |
229 | int i; |
230 | uint32_t possav = jindex[indptr]; /*Save the current marker*/ |
231 | indptr++; |
232 | INDEX_OVERFLOW_CHECK(indptr); |
233 | |
234 | /* generate index for *every* kanji in key */ |
235 | i = 2; |
236 | /*if this is a three byte kanji, ignore the 0x8f marker */ |
237 | if (currstr[0] == 0x8f) |
238 | i++; |
239 | /*step through... two by two*/ |
240 | for ( ; i < strlen(currstr); i+=2) |
241 | { |
242 | if((currstr[i] >= 0xb0) || (currstr[i] == 0x8f)) |
243 | { |
244 | /*Add in a specific reference to the kanji*/ |
245 | jindex[indptr] = possav+i; |
246 | indptr++; |
247 | INDEX_OVERFLOW_CHECK(indptr); |
248 | } |
249 | /*again the check if it's a three byte kanji*/ |
250 | if(currstr[i] == 0x8f) |
251 | i++; |
252 | } |
253 | } |
254 | } |
255 | } |
256 | } |
257 | indptr--; /*correct for the overshoot */ |
258 | return indptr; |
259 | } |
260 | |
/*===function to read the dictionary file into memory, returning its size===*/
/* Loads the whole file dictName into a newly malloc()ed buffer.

   Layout of the returned buffer (file size = n bytes, n + 100 allocated):
     byte 0            spare leading byte, set to 0
     bytes 1 .. n      the file contents
     bytes n+1 .. n+99 padding, set to 0

   dictName  path of the dictionary file
   filesize  out: size of the file in bytes (n)
   returns   the buffer; the caller owns it and releases it with free()

   Any failure (stat, size out of range, open, allocation, short read)
   prints a message and terminates the program with exit(1). */
unsigned char*
readDictionary(const char* dictName,uint32_t *filesize) {
    FILE *fp;
    struct stat buf;
    unsigned char *memDictionary;
    size_t length;
    size_t got;

    if (stat(dictName, &buf) != 0)  /* e.g. the dict file doesn't exist */
    {
        perror(NULL);
        fprintf(stderr, "Cannot stat: %s \n" , dictName);
        exit(1);
    }

    /* Offsets are stored as 32-bit values and 100 bytes are added to the
       allocation, so refuse sizes that would not fit instead of silently
       truncating them. */
    if (buf.st_size < 0 || (unsigned long long)buf.st_size > UINT32_MAX - 100u)
    {
        fprintf(stderr, "Dictionary file too large: %s \n" , dictName);
        exit(1);
    }
    length = (size_t)buf.st_size;
    *filesize = (uint32_t)length;  /* file size in bytes */

    puts ("\nLoading Dictionary file. Please wait.....\n" );
    fp = fopen(dictName, "rb" );
    if (fp == NULL)
    {
        fprintf(stderr, "\nCannot open dictionary file\n" );
        exit(1);
    }

    /* Allocate the buffer 100 bytes larger than the dict filesize */
    memDictionary = malloc(length + 100);
    if (memDictionary == NULL)
    {
        fprintf(stderr,"malloc() for dictionary failed.\n" );
        fclose(fp);
        exit(1);
    }

    /* Give the bytes outside the file data a defined value, so that code
       reading slightly past the data never sees uninitialised memory. */
    memDictionary[0] = 0;
    memset(memDictionary + 1 + length, 0, 99);

    /* One read of the whole file, stored from offset 1 */
    got = fread(memDictionary + 1, 1, length, fp);
    if (got != length)
    {
        fprintf(stderr, "\nCannot read dictionary file\n" );
        fclose(fp);
        free(memDictionary);
        exit(1);
    }
    fclose(fp);

    return memDictionary;
}
306 | |
307 | /*======function to sort jindex table====================*/ |
308 | /*see the index generator for information about what jindex contains |
309 | This simply sorts that output according to the data in the dictionary*/ |
310 | void jqsort(int32_t lhs, int32_t rhs) |
311 | { |
312 | int32_t i,last,midp; |
313 | uint32_t temp; |
314 | |
315 | if (lhs >= rhs) return; |
316 | |
317 | midp = (lhs+rhs)/2; /* calculate the midpoint */ |
318 | |
319 | /*Swap (midp,lhs) */ |
320 | temp = jindex[lhs]; |
321 | jindex[lhs] = jindex[midp]; |
322 | jindex[midp] = temp; |
323 | |
324 | last = lhs; |
325 | for (i = lhs+1;i <= rhs; i++) |
326 | { |
327 | if (Kstrcmp(jindex[i],jindex[lhs]) < 0) |
328 | { |
329 | /* Swap(++last,i);*/ |
330 | last++; |
331 | temp = jindex[i]; |
332 | jindex[i] = jindex[last]; |
333 | jindex[last] = temp; |
334 | } |
335 | } |
336 | |
337 | /* Swap (lhs,last);*/ |
338 | temp = jindex[lhs]; |
339 | jindex[lhs] = jindex[last]; |
340 | jindex[last] = temp; |
341 | |
342 | jqsort(lhs,last-1); |
343 | jqsort(last+1,rhs); |
344 | } |
345 | |
346 | /*=====string comparison used by jqsort==========================*/ |
347 | int Kstrcmp(uint32_t lhs, uint32_t rhs) |
348 | { |
349 | int i,c1 = 0, c2 = 0; |
350 | /* effectively does a strnicmp on two "strings" within the dictionary, |
351 | except it will make katakana and hirgana match (EUC A4 & A5) */ |
352 | |
353 | for (i = 0; i<20 ; i++) /*Compare up to 20 chars*/ |
354 | { |
355 | c1 = db[lhs+i]; |
356 | c2 = db[rhs+i]; |
357 | |
358 | if ((i % 2) == 0) /*If we're reading the first byte*/ |
359 | { |
360 | if (c1 == 0xA5) /*Change hiragana to katakana for */ |
361 | c1 = 0xA4; /*The purposes of this comparison */ |
362 | if (c2 == 0xA5) |
363 | c2 = 0xA4; |
364 | } |
365 | |
366 | /*If this is ascii, remove the difference between capitals and small*/ |
367 | if ((c1 >= 'A') && (c1 <= 'Z')) c1 |= 0x20; |
368 | if ((c2 >= 'A') && (c2 <= 'Z')) c2 |= 0x20; |
369 | |
370 | if (c1 != c2 ) break; |
371 | } |
372 | return(c1-c2); |
373 | } |
374 | |
/*=======function to test a character for alpha or kana/kanji====*/
/* Classifies one byte.  Returns 1 (TRUE) when x is
     - an ASCII letter  'A'..'Z' or 'a'..'z',
     - an ASCII digit   '0'..'9', or
     - any byte with the high bit set (part of an EUC kana/kanji),
   and 0 (FALSE) for every other byte. */
int alphaoreuc(unsigned char x)
{
    const int code = x & 0xff;
    const int isUpper = (code >= 'A') && (code <= 'Z');
    const int isLower = (code >= 'a') && (code <= 'z');
    const int isDigit = (code >= '0') && (code <= '9');
    const int isEuc = (code & 0x80) != 0;

    return isUpper || isLower || isDigit || isEuc;
}
398 | |
399 | |