1/**************************************************************************
2* X J D X G E N
3* Author: Jim Breen
4* Index (.xjdx) generator program from XJDIC
5*
6* V2.3 - indexes JIS X 0212 (3-byte EUC) kanji
7* Copyright 1998 Jim Breen <jwb@csse.monash.edu.au>
8***************************************************************************/
9/* This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 1, or (at your option)
12 any later version.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
22
23/* Changed: ignore all rc stuff. use args 1 and 2 for input/output file.
24 -- jason */
25
26/* Heavily commented, removed the unused header file, split off the
27 readDictionary function, removed unused functions and variables... cleaned
28 up the code in general. Preparing for integration to the rest of the program
29
30 Note that this indexer has been hacked off of Jim Breen's xjdic program,
31 and a lot of the things which have been removed were relevant to that
32 program, but not to this one.
33 --Joe
34 */
35
36#include <config-kiten.h>
37#include <sys/stat.h>
38#include <unistd.h>
39#include <stdio.h>
40#include <stdlib.h>
41#include <ctype.h>
42#include <string.h>
43
44#ifdef HAVE_STDINT_H
45#include <stdint.h>
46#endif
47#ifdef HAVE_INTTYPES_H
48#include <inttypes.h>
49#endif
50
#define TRUE 1
#define FALSE 0
#define SPTAG '@'    /* leading marker of a priority entry in the dictionary */
#define TOKENLIM 40  /* size of the buffer used to collect one token */
/* The last time the index structure changed was Version 1.4.
   Note: no trailing ';' here, so the macro can be used inside expressions. */
#define INDEX_VERSION 14
56
/*====== shared state ==============================================*/
/* In-memory copy of the dictionary file, as returned by readDictionary() */
unsigned char *db;
/* Index table: slot 0 receives the header written by main(), the slots
   from 1 upwards hold byte offsets into db */
uint32_t *jindex;
/* Size, in bytes, that main() chose for the index table */
uint32_t indlen;

/*====== prototypes ================================================*/
unsigned char *readDictionary(const char *dictName, uint32_t *filesize);
uint32_t buildIndex(unsigned char *dict, uint32_t dictLength);
void jqsort(int32_t lhs, int32_t rhs);
int Kstrcmp(uint32_t lhs, uint32_t rhs);
int alphaoreuc(unsigned char x);
67
68/*====function to Load Dictionary and load/create index table=======*/
69int main(int argc, char **argv)
70{
71 const char *Dname;
72 const char *JDXname;
73 FILE *fp;
74 uint32_t diclen;
75 uint32_t indptr;
76
77 printf("\nNOTE: running this program by itself is never necessary. Kiten will run it automatically.\n");
78 printf("\nXJDXGEN V2.3 Index Table Generator for XJDIC. \n Copyright J.W. Breen, 1998\n");
79
80 if (argc < 3)
81 {
82 printf("\nUSAGE: kitengen input output.xjdx\n");
83 exit(2);
84 }
85
86 Dname = argv[1]; /*Name of the dictionary being scanned */
87 JDXname = argv[2]; /*Name of the output file */
88 printf("Commandline request to use files %s and %s \n", Dname, JDXname);
89 printf("\nWARNING!! This program may take a long time to run .....\n");
90
91 db = readDictionary(Dname,&diclen); /*Reads the dict, but leaves a space at the beginning*/
92 diclen++; /*add one to the number of bytes considered in the file */
93 db[diclen] = 10; /*set the first and final entry in the database to 10 */
94 db[0] = 10;
95 printf("Dictionary size: %d bytes.\n",diclen);
96
97
98 indlen = (diclen * 3)/4; /*Make a wild guess at the index file length */
99 jindex = (uint32_t *)malloc(indlen); /* and allocate it */
100 if(jindex == NULL)
101 {
102 fprintf(stderr,"malloc() for index table failed.\n");
103 exit(1);
104 }
105
106 printf("Parsing.... \n");
107 /*this is the dictionary parser. It places an entry in jindex for every
108 kana/kanji string and every alphabetic string it finds which is >=3
109 characters */
110 indptr = buildIndex(db,diclen);
111
112 printf("Index entries: %d \nSorting (this is slow)......\n",indptr);
113 jqsort((int32_t)1,indptr);
114
115 printf("Sorted\nWriting index file ....\n");
116 fp = fopen(JDXname,"wb");
117 if (fp==NULL )
118 {
119 printf("\nCannot open %s output file\n",JDXname);
120 exit(1);
121 }
122 jindex[0] = diclen+INDEX_VERSION; /* prepend the index file size + version # */
123 fwrite(jindex,sizeof(int32_t),indptr+1,fp);
124 fclose(fp);
125
126 return 0;
127}
128
129/*=========function to parse the dict file and fill the jindex global with the index====*/
130/*=========returns the size of the index file ====*/
131/*
132 A bit of explanation on what this thing generates is probably in order.
133 Essentially, it fills jindex with a large number of numbers... each number
134 being an offset to a byte location inside of the dictionary file. Starting
135 at position index 1 (second pos)
136 In other words... feeding this thing the dict file
137 "Llama X1\nJT Fred Flintstone X"
138 would generate: {<unmodified>,0,6,12,17}.
139 "X" is skipped because it is only 1 byte long.
140 "JT" is skipped because it is only two bytes long, the J is regular ascii
141 (<127), and the T is not a digit. If any of those were different, (it
142 was longer than 2 bytes, was an euc (kana or kanji) character, or T was
143 a digit) it would be included in the index.
144*/
145
146/*First... an ugly #define to make our code a bit more readable*/
147#define INDEX_OVERFLOW_CHECK(x) {if(x > indlen/sizeof(int32_t)) { \
148 printf("Index table overflow. Dictionary too large?\n"); exit(1); } }
149
150uint32_t buildIndex(unsigned char *dict, uint32_t dictLength) {
151 int nowReadingWord = FALSE; /*Boolean to track if we're mid-word in the dict */
152 int currentDictCharacter; /*Current character index in the dict */
153 unsigned char c; /*the current reading character*/
154 unsigned char currstr[TOKENLIM]; /* String that we're currently getting */
155 int currstrIndex = 0;
156 uint32_t indptr = 1; /* next 'slot' in the index to fill */
157 int saving = FALSE; /*is what we are doing right now slated for salvation?*/
158
159 for (currentDictCharacter =0; currentDictCharacter < dictLength;
160 currentDictCharacter++)
161 {
162 c = dict[currentDictCharacter]; /* Fetch the next character */
163
164 if(!nowReadingWord) /*if we are NOT in the middle of reading a word */
165 {
166 if (alphaoreuc(c) || c == SPTAG) /* if character or priority entry */
167 {
168 nowReadingWord = TRUE; /* Mark that we're mid word */
169 jindex[indptr] = currentDictCharacter;
170 /* copy the location of this character to our index structure */
171 currstrIndex = 1;
172 /*mark the next position in the string to copy a char into */
173 currstr[0] = c;
174 /*set the current string to be equal to this character so far */
175 currstr[1] = '\0';
176 saving = TRUE;
177 }
178 } else { /*If we're in the middle of parsing a word atm */
179
180 /*if it's alphanumeric or - or . copy it and increment where the
181 next one goes */
182 if ((alphaoreuc(c))||(c == '-')||(c == '.')||((c >= '0') && (c<='9')))
183 {
184 currstr[currstrIndex] = c;
185 if(currstrIndex < TOKENLIM-1)
186 currstrIndex++;
187 }
188 else /* We were reading a word... and we just encountered the
189 end of the word */
190 {
191 currstr[currstrIndex] = '\0'; /*null terminate the string */
192 nowReadingWord = FALSE;
193
194 /*Don't save single or dual character items where the
195 first item is ascii */
196 if ((strlen(currstr) <= 2) && (currstr[0] < 127))
197 saving = FALSE;
198 /*EXCEPT: Save anything that's two character where the second
199 is a number
200 Note that this might catch single 2-byte kanji as well...
201 but it might not*/
202 if ((strlen(currstr) == 2) && (currstr[1] <= '9'))
203 saving = TRUE;
204
205 /* This is a latin-character string, either longer than 2 bytes
206 or having an ascii digit for a second byte */
207 if (saving && (currstr[0] < 127))
208 {
209 indptr++;
210 INDEX_OVERFLOW_CHECK(indptr);
211
212 /* If this is non-Japanese, and has a 'SPTAGn' tag, generate
213 two indices */
214 if ( currstr[0] == SPTAG)
215 {
216 /*make a separate entry pointing to
217 the non-SPTAG'd entry (the next byte)*/
218 jindex[indptr] = jindex[indptr-1]+1;
219 /*overwrite the SPTAG marker*/
220 strcpy(currstr,currstr+1);
221 indptr++;
222 INDEX_OVERFLOW_CHECK(indptr);
223 }
224 }
225
226 /*For strings that start with non latin characters*/
227 if (saving && (currstr[0] > 127))
228 {
229 int i;
230 uint32_t possav = jindex[indptr]; /*Save the current marker*/
231 indptr++;
232 INDEX_OVERFLOW_CHECK(indptr);
233
234 /* generate index for *every* kanji in key */
235 i = 2;
236 /*if this is a three byte kanji, ignore the 0x8f marker */
237 if (currstr[0] == 0x8f)
238 i++;
239 /*step through... two by two*/
240 for ( ; i < strlen(currstr); i+=2)
241 {
242 if((currstr[i] >= 0xb0) || (currstr[i] == 0x8f))
243 {
244 /*Add in a specific reference to the kanji*/
245 jindex[indptr] = possav+i;
246 indptr++;
247 INDEX_OVERFLOW_CHECK(indptr);
248 }
249 /*again the check if it's a three byte kanji*/
250 if(currstr[i] == 0x8f)
251 i++;
252 }
253 }
254 }
255 }
256 }
257 indptr--; /*correct for the overshoot */
258 return indptr;
259}
260
/*=== function to read the dictionary file into memory ===*/
/*
 Loads the file named dictName into a newly allocated buffer.

 dictName  path of the dictionary file
 filesize  receives the size of the file in bytes

 Returns a buffer of *filesize + 100 bytes: byte 0 is left free, the file
 contents occupy bytes 1 .. *filesize, and everything not read from the
 file is zero. The caller owns the buffer and releases it with free().

 Calls exit(1) if the file cannot be examined, opened or completely read,
 if it is too large for a 32-bit size, or if memory runs out.
*/
unsigned char *
readDictionary(const char *dictName, uint32_t *filesize)
{
    FILE *fp;
    struct stat buf;
    unsigned char *memDictionary;
    size_t wanted;

    if (stat(dictName, &buf) != 0) /* if the dict file doesn't exist */
    {
        perror(NULL);
        printf("Cannot stat: %s \n", dictName);
        exit(1);
    }

    /* The size is handed back as 32 bits and padded by 100 bytes below,
       so anything larger cannot be represented */
    if (buf.st_size < 0 ||
        (unsigned long long)buf.st_size > 0xFFFFFFFFuLL - 100)
    {
        fprintf(stderr, "Dictionary file is too large: %s\n", dictName);
        exit(1);
    }
    *filesize = (uint32_t)buf.st_size; /* file size in bytes */

    puts("\nLoading Dictionary file. Please wait.....\n");
    fp = fopen(dictName, "rb");
    if (fp == NULL)
    {
        printf("\nCannot open dictionary file\n");
        exit(1);
    }

    /* 100 bytes larger than the file; calloc() zeroes the padding so that
       code reading a little past the data sees defined values */
    memDictionary = calloc((size_t)*filesize + 100, sizeof(unsigned char));
    if (memDictionary == NULL)
    {
        fprintf(stderr, "malloc() for dictionary failed.\n");
        fclose(fp);
        exit(1);
    }

    /* Read the whole file in one go, storing it at offset 1 */
    wanted = *filesize;
    if (fread(memDictionary + 1, 1, wanted, fp) != wanted)
    {
        fprintf(stderr, "Could not read the whole dictionary file: %s\n", dictName);
        fclose(fp);
        free(memDictionary);
        exit(1);
    }
    fclose(fp);

    return memDictionary;
}
306
307/*======function to sort jindex table====================*/
308/*see the index generator for information about what jindex contains
309 This simply sorts that output according to the data in the dictionary*/
310void jqsort(int32_t lhs, int32_t rhs)
311{
312 int32_t i,last,midp;
313 uint32_t temp;
314
315 if (lhs >= rhs) return;
316
317 midp = (lhs+rhs)/2; /* calculate the midpoint */
318
319 /*Swap (midp,lhs) */
320 temp = jindex[lhs];
321 jindex[lhs] = jindex[midp];
322 jindex[midp] = temp;
323
324 last = lhs;
325 for (i = lhs+1;i <= rhs; i++)
326 {
327 if (Kstrcmp(jindex[i],jindex[lhs]) < 0)
328 {
329 /* Swap(++last,i);*/
330 last++;
331 temp = jindex[i];
332 jindex[i] = jindex[last];
333 jindex[last] = temp;
334 }
335 }
336
337/* Swap (lhs,last);*/
338 temp = jindex[lhs];
339 jindex[lhs] = jindex[last];
340 jindex[last] = temp;
341
342 jqsort(lhs,last-1);
343 jqsort(last+1,rhs);
344}
345
346/*=====string comparison used by jqsort==========================*/
347int Kstrcmp(uint32_t lhs, uint32_t rhs)
348{
349 int i,c1 = 0, c2 = 0;
350/* effectively does a strnicmp on two "strings" within the dictionary,
351 except it will make katakana and hirgana match (EUC A4 & A5) */
352
353 for (i = 0; i<20 ; i++) /*Compare up to 20 chars*/
354 {
355 c1 = db[lhs+i];
356 c2 = db[rhs+i];
357
358 if ((i % 2) == 0) /*If we're reading the first byte*/
359 {
360 if (c1 == 0xA5) /*Change hiragana to katakana for */
361 c1 = 0xA4; /*The purposes of this comparison */
362 if (c2 == 0xA5)
363 c2 = 0xA4;
364 }
365
366 /*If this is ascii, remove the difference between capitals and small*/
367 if ((c1 >= 'A') && (c1 <= 'Z')) c1 |= 0x20;
368 if ((c2 >= 'A') && (c2 <= 'Z')) c2 |= 0x20;
369
370 if (c1 != c2 ) break;
371 }
372 return(c1-c2);
373}
374
375/*=======function to test a character for alpha or kana/kanji====*/
376int alphaoreuc(unsigned char x)
377{
378 int c;
379
380 c = x & 0xff;
381 if(((c >= 65) && (c <= 90)) || ((c >= 97) && (c <= 122)))
382 /*ASCII alphabet*/
383 {
384 return (TRUE);
385 }
386 if ((c >= '0') && (c <= '9'))
387 /*digits*/
388 {
389 return(TRUE);
390 }
391 if ((c & 0x80) > 0)
392 /*EUC kanji/kana*/
393 {
394 return(TRUE);
395 }
396 return (FALSE);
397}
398
399