xjdxgen.c [kdeedu/kiten/xjdxgen.c]

/**************************************************************************
* X J D X G E N
* Author: Jim Breen
* Index (.xjdx) generator program from XJDIC
*
* V2.3 - indexes JIS X 0212 (3-byte EUC) kanji
* Copyright 1998 Jim Breen <jwb@csse.monash.edu.au>
***************************************************************************/
/* This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 1, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
22
/* Changed: ignore all rc stuff. Use args 1 and 2 for the input/output files.
   -- jason */
25
/* Heavily commented, removed the unused header file, split off the
   readDictionary function, removed unused functions and variables... cleaned
   up the code in general. Preparing for integration with the rest of the program.

   Note that this indexer has been hacked off of Jim Breen's xjdic program,
   and a lot of the things which have been removed were relevant to that
   program, but not to this one.
   --Joe
*/
35
36	#include <config-kiten.h>
37	#include <sys/stat.h>
38	#include <unistd.h>
39	#include <stdio.h>
40	#include <stdlib.h>
41	#include <ctype.h>
42	#include <string.h>
43
44	#ifdef HAVE_STDINT_H
45	#include <stdint.h>
46	#endif
47	#ifdef HAVE_INTTYPES_H
48	#include <inttypes.h>
49	#endif
50
#define TRUE 1
#define FALSE 0
#define SPTAG '@'        /* leading byte that marks a priority entry */
#define TOKENLIM 40      /* size of the token buffer used while parsing */
/* The last time the index structure changed was Version 1.4.
   No trailing semicolon: the macro must be usable inside expressions. */
#define INDEX_VERSION 14

unsigned char *db;       /* in-memory dictionary image (set in main) */
uint32_t *jindex;        /* index table: byte offsets into db, slot 0 is the header */
uint32_t indlen;         /* size of the jindex allocation, in bytes */
60
/*====== prototypes =================================================*/
/* Sorts jindex[i..j] (inclusive) by the dictionary text each entry points at. */
void jqsort(int32_t i, int32_t j);
/* Compares the dictionary text at two byte offsets (see definition). */
int Kstrcmp(uint32_t lhs, uint32_t rhs);
/* Non-zero if x is an ASCII letter, an ASCII digit, or has its high bit set. */
int alphaoreuc(unsigned char x);
/* Loads the dictionary file into memory and reports its size via *filesize. */
unsigned char* readDictionary(const char* dictName,uint32_t *filesize);
/* Fills jindex from the dictionary image and returns the number of entries. */
uint32_t buildIndex(unsigned char* dict, uint32_t dictLength);
67
68	/====function to Load Dictionary and load/create index table=======/
69	int main(int argc, char **argv)
70	{
71	const char *Dname;
72	const char *JDXname;
73	FILE *fp;
74	uint32_t diclen;
75	uint32_t indptr;
76
77	printf("\nNOTE: running this program by itself is never necessary. Kiten will run it automatically.\n");
78	printf("\nXJDXGEN V2.3 Index Table Generator for XJDIC. \n Copyright J.W. Breen, 1998\n");
79
80	if (argc < `3`)
81	{
82	printf("\nUSAGE: kitengen input output.xjdx\n");
83	exit(`2`);
84	}
85
86	Dname = argv[`1`]; /Name of the dictionary being scanned /
87	JDXname = argv[`2`]; /Name of the output file /
88	printf("Commandline request to use files %s and %s \n", Dname, JDXname);
89	printf("\nWARNING!! This program may take a long time to run .....\n");
90
91	db = readDictionary(Dname,&diclen); /Reads the dict, but leaves a space at the beginning/
92	diclen++; /add one to the number of bytes considered in the file /
93	db[diclen] = `10`; /set the first and final entry in the database to 10 /
94	db[`0`] = `10`;
95	printf("Dictionary size: %d bytes.\n",diclen);
96
97
98	indlen = (diclen * `3`)/`4`; /Make a wild guess at the index file length /
99	jindex = (uint32_t )malloc(indlen); /* and allocate it /
100	if(jindex == NULL)
101	{
102	fprintf(stderr,"malloc() for index table failed.\n");
103	exit(`1`);
104	}
105
106	printf("Parsing.... \n");
107	/this is the dictionary parser. It places an entry in jindex for every*
108	kana/kanji string and every alphabetic string it finds which is >=3
109	characters /*
110	indptr = buildIndex(db,diclen);
111
112	printf("Index entries: %d \nSorting (this is slow)......\n",indptr);
113	jqsort((int32_t)`1`,indptr);
114
115	printf("Sorted\nWriting index file ....\n");
116	fp = fopen(JDXname,"wb");
117	if (fp==NULL )
118	{
119	printf("\nCannot open %s output file\n",JDXname);
120	exit(`1`);
121	}
122	jindex[`0`] = diclen+INDEX_VERSION; / prepend the index file size + version # /
123	fwrite(jindex,sizeof(int32_t),indptr+`1`,fp);
124	fclose(fp);
125
126	return `0`;
127	}
128
129	/=========function to parse the dict file and fill the jindex global with the index====/
130	/=========returns the size of the index file ====/
131	/*
132	A bit of explanation on what this thing generates is probably in order.
133	Essentially, it fills jindex with a large number of numbers... each number
134	being an offset to a byte location inside of the dictionary file. Starting
135	at position index 1 (second pos)
136	In other words... feeding this thing the dict file
137	"Llama X1\nJT Fred Flintstone X"
138	would generate: {<unmodified>,0,6,12,17}.
139	"X" is skipped because it is only 1 byte long.
140	"JT" is skipped because it is only two bytes long, the J is regular ascii
141	(<127), and the T is not a digit. If any of those were different, (it
142	was longer than 2 bytes, was an euc (kana or kanji) character, or T was
143	a digit) it would be included in the index.
144	*/
145
146	/First... an ugly #define to make our code a bit more readable/
147	#define INDEX_OVERFLOW_CHECK(x) {if(x > indlen/sizeof(int32_t)) { \
148	printf("Index table overflow. Dictionary too large?\n"); exit(1); } }
149
150	uint32_t buildIndex(unsigned char *dict, uint32_t dictLength) {
151	int nowReadingWord = FALSE; /Boolean to track if we're mid-word in the dict /
152	int currentDictCharacter; /Current character index in the dict /
153	unsigned char c; /the current reading character/
154	unsigned char currstr[TOKENLIM]; / String that we're currently getting /
155	int currstrIndex = `0`;
156	uint32_t indptr = `1`; / next 'slot' in the index to fill /
157	int saving = FALSE; /is what we are doing right now slated for salvation?/
158
159	for (currentDictCharacter =`0`; currentDictCharacter < dictLength;
160	currentDictCharacter++)
161	{
162	c = dict[currentDictCharacter]; / Fetch the next character /
163
164	if(!nowReadingWord) /if we are NOT in the middle of reading a word /
165	{
166	if (alphaoreuc(c) \|\| c == SPTAG) / if character or priority entry /
167	{
168	nowReadingWord = TRUE; / Mark that we're mid word /
169	jindex[indptr] = currentDictCharacter;
170	/ copy the location of this character to our index structure /
171	currstrIndex = `1`;
172	/mark the next position in the string to copy a char into /
173	currstr[`0`] = c;
174	/set the current string to be equal to this character so far /
175	currstr[`1`] = '\0';
176	saving = TRUE;
177	}
178	} else { /If we're in the middle of parsing a word atm /
179
180	/if it's alphanumeric or - or . copy it and increment where the*
181	next one goes /*
182	if ((alphaoreuc(c))\|\|(c == '-')\|\|(c == '.')\|\|((c >= '0') && (c<='9')))
183	{
184	currstr[currstrIndex] = c;
185	if(currstrIndex < TOKENLIM-`1`)
186	currstrIndex++;
187	}
188	else / We were reading a word... and we just encountered the*
189	end of the word /*
190	{
191	currstr[currstrIndex] = '\0'; /null terminate the string /
192	nowReadingWord = FALSE;
193
194	/Don't save single or dual character items where the*
195	first item is ascii /*
196	if ((strlen(currstr) <= `2`) && (currstr[`0`] < `127`))
197	saving = FALSE;
198	/EXCEPT: Save anything that's two character where the second*
199	is a number
200	Note that this might catch single 2-byte kanji as well...
201	but it might not/*
202	if ((strlen(currstr) == `2`) && (currstr[`1`] <= '9'))
203	saving = TRUE;
204
205	/ This is a latin-character string, either longer than 2 bytes*
206	or having an ascii digit for a second byte /*
207	if (saving && (currstr[`0`] < `127`))
208	{
209	indptr++;
210	INDEX_OVERFLOW_CHECK(indptr);
211
212	/ If this is non-Japanese, and has a 'SPTAGn' tag, generate*
213	two indices /*
214	if ( currstr[`0`] == SPTAG)
215	{
216	/make a separate entry pointing to*
217	the non-SPTAG'd entry (the next byte)/*
218	jindex[indptr] = jindex[indptr-`1`]+`1`;
219	/overwrite the SPTAG marker/
220	strcpy(currstr,currstr+`1`);
221	indptr++;
222	INDEX_OVERFLOW_CHECK(indptr);
223	}
224	}
225
226	/For strings that start with non latin characters/
227	if (saving && (currstr[`0`] > `127`))
228	{
229	int i;
230	uint32_t possav = jindex[indptr]; /Save the current marker/
231	indptr++;
232	INDEX_OVERFLOW_CHECK(indptr);
233
234	/ generate index for every kanji in key /
235	i = `2`;
236	/if this is a three byte kanji, ignore the 0x8f marker /
237	if (currstr[`0`] == `0x8f`)
238	i++;
239	/step through... two by two/
240	for ( ; i < strlen(currstr); i+=`2`)
241	{
242	if((currstr[i] >= `0xb0`) \|\| (currstr[i] == `0x8f`))
243	{
244	/Add in a specific reference to the kanji/
245	jindex[indptr] = possav+i;
246	indptr++;
247	INDEX_OVERFLOW_CHECK(indptr);
248	}
249	/again the check if it's a three byte kanji/
250	if(currstr[i] == `0x8f`)
251	i++;
252	}
253	}
254	}
255	}
256	}
257	indptr--; /correct for the overshoot /
258	return indptr;
259	}
260
/*=== function to read the dictionary file into an array, returning filesize ===*/
/*
 * Loads the file dictName into a freshly allocated buffer.
 *
 * Layout of the returned buffer (filesize + 100 bytes in total):
 *   byte 0                  - spare byte for the caller
 *   bytes 1 .. filesize     - the file contents
 *   remaining 99 bytes      - padding
 * Everything not filled from the file is zero, so code that reads a little
 * past the end of the data sees defined values.
 *
 * dictName: path of the dictionary file.
 * filesize: receives the size of the file in bytes.
 * Returns the buffer; the caller owns it and releases it with free().
 * Any failure prints a message and terminates the program with status 1.
 */
unsigned char*
readDictionary(const char* dictName,uint32_t *filesize)
{
	FILE *fp;
	struct stat buf;
	unsigned char *memDictionary;
	size_t length;

	if (stat(dictName, &buf) != 0) /* if the dict file doesn't exist */
	{
		perror(NULL);
		printf("Cannot stat: %s \n",dictName);
		exit(1);
	}

	/* Offsets into the dictionary are stored as 32-bit values, so a file
	   that does not fit (with room for the padding) cannot be indexed */
	if (buf.st_size < 0 ||
	    (unsigned long long)buf.st_size > 0xFFFFFFFFull - 100)
	{
		fprintf(stderr,"Dictionary file %s is too large.\n",dictName);
		exit(1);
	}

	*filesize = (uint32_t)buf.st_size; /* file size in bytes */
	length = (size_t)buf.st_size;

	puts ("\nLoading Dictionary file. Please wait.....\n");
	fp = fopen(dictName,"rb");
	if (fp == NULL)
	{
		printf("\nCannot open dictionary file\n");
		exit(1);
	}

	/* Allocate 100 bytes more than the file size, zero-filled */
	memDictionary = calloc(length + 100, sizeof *memDictionary);
	if (memDictionary == NULL)
	{
		fprintf(stderr,"malloc() for dictionary failed.\n");
		fclose(fp);
		exit(1);
	}

	/* Read the whole file in one go, storing it at offset 1 */
	if (fread(memDictionary + 1, 1, length, fp) != length)
	{
		fprintf(stderr,"Error reading dictionary file %s\n",dictName);
		fclose(fp);
		free(memDictionary);
		exit(1);
	}
	fclose(fp);

	return memDictionary;
}
306
307	/======function to sort jindex table====================/
308	/see the index generator for information about what jindex contains*
309	This simply sorts that output according to the data in the dictionary/*
310	void jqsort(int32_t lhs, int32_t rhs)
311	{
312	int32_t i,last,midp;
313	uint32_t temp;
314
315	if (lhs >= rhs) return;
316
317	midp = (lhs+rhs)/`2`; / calculate the midpoint /
318
319	/Swap (midp,lhs) /
320	temp = jindex[lhs];
321	jindex[lhs] = jindex[midp];
322	jindex[midp] = temp;
323
324	last = lhs;
325	for (i = lhs+`1`;i <= rhs; i++)
326	{
327	if (Kstrcmp(jindex[i],jindex[lhs]) < `0`)
328	{
329	/ Swap(++last,i);/
330	last++;
331	temp = jindex[i];
332	jindex[i] = jindex[last];
333	jindex[last] = temp;
334	}
335	}
336
337	/ Swap (lhs,last);/
338	temp = jindex[lhs];
339	jindex[lhs] = jindex[last];
340	jindex[last] = temp;
341
342	jqsort(lhs,last-`1`);
343	jqsort(last+`1`,rhs);
344	}
345
346	/=====string comparison used by jqsort==========================/
347	int Kstrcmp(uint32_t lhs, uint32_t rhs)
348	{
349	int i,c1 = `0`, c2 = `0`;
350	/ effectively does a strnicmp on two "strings" within the dictionary,*
351	except it will make katakana and hirgana match (EUC A4 & A5) /*
352
353	for (i = `0`; i<`20` ; i++) /Compare up to 20 chars/
354	{
355	c1 = db[lhs+i];
356	c2 = db[rhs+i];
357
358	if ((i % `2`) == `0`) /If we're reading the first byte/
359	{
360	if (c1 == `0xA5`) /Change hiragana to katakana for /
361	c1 = `0xA4`; /The purposes of this comparison /
362	if (c2 == `0xA5`)
363	c2 = `0xA4`;
364	}
365
366	/If this is ascii, remove the difference between capitals and small/
367	if ((c1 >= 'A') && (c1 <= 'Z')) c1 \|= `0x20`;
368	if ((c2 >= 'A') && (c2 <= 'Z')) c2 \|= `0x20`;
369
370	if (c1 != c2 ) break;
371	}
372	return(c1-c2);
373	}
374
/*======= function to test a character for alpha or kana/kanji ====*/
/*
 * Returns 1 (true) when x is an ASCII letter, an ASCII digit, or a byte
 * with the high bit set (part of an EUC kana/kanji character); returns 0
 * (false) for everything else.
 */
int alphaoreuc(unsigned char x)
{
	/* ASCII alphabet */
	if (((x >= 'A') && (x <= 'Z')) || ((x >= 'a') && (x <= 'z')))
	{
		return 1;
	}
	/* digits */
	if ((x >= '0') && (x <= '9'))
	{
		return 1;
	}
	/* EUC kanji/kana */
	if ((x & 0x80) != 0)
	{
		return 1;
	}
	return 0;
}
398
399