C语言解析星际译王词典
作者: 晋哥哥 发表于 2010-01-07 23:08 原文链接 阅读: 223 评论: 0
扫盲时间:
星际译王,即StarDict,是利用GTK(GIMP Toolkit)开发的国际化、跨平台的自由桌面词典软件。它本身并不包含词典文件,使用者须自行下载词典配合使用。它可以运行于多种不同的平台,如Linux、Microsoft Windows、FreeBSD及Solaris,并以GPL授权发布。
星际译王项目:http://stardict.sourceforge.net/
星际译王词典:
下载:http://stardict.sourceforge.net/Dictionaries.php
好,言归正传:
有个学长用C+Python写了个web版的在线词典:C做词典服务器,Python做CGI服务器,两者通过FIFO通信,词典用的是星际译王格式的词典,很有意思。本人模仿学长的思路,仅为唤醒内心深处对纯C的美好回忆-_-。当然是在业余时间写的,不过由于工作中需要学的东西渐渐多了起来,就没什么时间继续做了。现在先公布一些代码给大家分享,免得以后忘光光-_-
最关键的就是对星际译王词典文件的解析,对于其格式,读者可以自己安装一个,在其安装目录里有个文件专门介绍了词典格式,英文的。
我简单说一下:一个词典分为三个文件。ifo:描述词典信息;idx:存放单词的索引,每条记录的格式为:单词(以'\0'结尾)+偏移值(4字节)+长度(4字节),偏移值和长度都按高位字节在前(大端)的顺序存放;dict:可以被压缩,只存放解释。
思路:根据输入的单词,在idx中查找其在dict文件中的偏移值和长度,直接取出,很简单。
以下代码使用emacs NT在winXP+MinGW环境下编译通过:
词典信息头文件(dict_info.h):
Code highlighting produced by Actipro CodeHighlighter (freeware)
http://www.CodeHighlighter.com/
/*
 * dict_info.h
 * Author:shoru
 * 2009-08-23 12:53
 *
 * Declarations for reading a dictionary ".ifo" description file.
 */

#ifndef DICT_INFO_H
#define DICT_INFO_H

/*
 * Test switch: the demo main() functions are compiled only when this
 * macro is defined.
 */
#define DEBUG

/*
 * Size in bytes of the buffer used to read one line of the .ifo file.
 */
#define BUFFER_SIZE 500

/*
 * File-name suffix of the .ifo file.
 */
#define IFO_EXT ".ifo"

/*
 * Contents of a dictionary .ifo file.
 */
typedef struct
{
    char version[100];          /* value of the "version" line */
    int wordcount;              /* number of words in the dictionary */
    int idxfilesize;            /* size of the index (.idx) file */
    char bookname[100];         /* dictionary name */
    char sametypesequence[10];  /* value of the "sametypesequence" line */
    char other_info[1000];      /* every other key=value line, concatenated */
} DICT_INFO;

/*
 * Parse the .ifo file named by `file` and return a pointer to a DICT_INFO
 * describing it, or NULL on failure.
 */
DICT_INFO *get_dict_info(char *file);

/*
 * Parse one line of the .ifo file into dict_info.
 *
 * NOTE(review): this prototype is `static` yet lives in a shared header, so
 * every file that includes the header receives its own declaration. It is
 * kept because dict_info.c calls parse_line() before defining it.
 */
static void parse_line(char *line, DICT_INFO *dict_info);

#endif /* DICT_INFO_H */
词典信息源码(dict_info.c):
Code highlighting produced by Actipro CodeHighlighter (freeware)
http://www.CodeHighlighter.com/
> 1 /*
2 * dict_info.c
3 * Author:shoru
4 * 2009-08-23 12:54
5 */
6
7 #include < stdio.h >
8 #include < stdlib.h >
9 #include < string .h >
10 #include < errno.h >
11 #include " dict_info.h "
12
13 /*
14 * 将词典的信息文件装入结构体,并返回该结构体指针
15 * 失败返回NULL
16 */
17 DICT_INFO * get_dict_info( char * info_file)
18 {
19 FILE * ifo;
20 char * line;
21 char buffer[BUFFER_SIZE];
22
23 DICT_INFO * dict_info = (DICT_INFO * )malloc( sizeof (DICT_INFO));
24
25 ifo = fopen(info_file, " r " );
26 if (ifo == NULL)
27 {
28 fprintf(stderr, " %s " ,strerror(errno));
29 return NULL;
30 }
31
32 while ((line = fgets(buffer,BUFFER_SIZE,ifo)) != NULL)
33 {
34 parse_line(line,dict_info);
35 }
36 fclose(ifo);
37
38 return dict_info;
39 }
40
41 /*
42 * 逐行解析文件,将信息装入特定字段
43 */
44 static void parse_line( char * line,DICT_INFO * dict_info)
45 {
46 char * idx;
47
48 if ((idx = strchr(line, ' = ' )) != NULL)
49 {
50 if (strstr(line, " version " ) != NULL)
51 {
52 strcpy(dict_info -> version,idx + 1 );
53 } else if (strstr(line, " wordcount " ) != NULL)
54 {
55 dict_info -> wordcount = atoi(idx + 1 );
56 } else if (strstr(line, " idxfilesize " ) != NULL)
57 {
58 dict_info -> idxfilesize = atoi(idx + 1 );
59 } else if (strstr(line, " bookname " ) != NULL)
60 {
61 strcpy(dict_info -> bookname,idx + 1 );
62 } else if (strstr(line, " sametypesequence " ) != NULL)
63 {
64 strcpy(dict_info -> sametypesequence,idx + 1 );
65 } else {
66 strcat(dict_info -> other_info,line);
67 }
68 }
69 }
70
71
#ifdef DEBUG

/*
 * Demo entry point: load an .ifo file and print the parsed fields.
 *
 * The file defaults to the bundled Oxford dictionary; an alternative path
 * may be given as the first command-line argument.
 *
 * Returns EXIT_SUCCESS, or exits with EXIT_FAILURE when the file cannot be
 * loaded.
 */
int main(int argc, char **argv)
{
    char *path = "../dict/oxford-gb/oxford-gb-formated.ifo";

    if (argc > 1) {
        path = argv[1];
    }

    DICT_INFO *info = get_dict_info(path);
    if (info == NULL) {
        printf("error\n");
        exit(EXIT_FAILURE);
    }

    /* version and bookname are printed without "\n": the stored values
     * keep the newline read from the file. */
    printf("version:%s", info->version);
    printf("bookname:%s", info->bookname);
    printf("wordcount:%d\n", info->wordcount);
    printf("idxfilesize:%d\n", info->idxfilesize);
    printf("sts:%s\n", info->sametypesequence);
    printf("%s", info->other_info);

    free(info);
    return EXIT_SUCCESS;
}
#endif /* DEBUG */
词典索引头文件(dict_idx.h):
Code highlighting produced by Actipro CodeHighlighter (freeware)
http://www.CodeHighlighter.com/
> 1 /*
2 * dict_idx.h
3 * Author:shoru
4 * 2009-09-09 12:27
5 */
6
7 #ifndef _DICT_IDX_H
8 #define _DICT_IDX_H
9
10 #include " dict_info.h "
11 /*
12 * 测试开关
13 */
14 #define DEBUG
15
16 #define TRUE 1
17 /*
18 * idx文件后缀
19 */
20 #define IDX_EXT "idx"
21
22 /*
23 * Struct to describe the idx file.
24 */
25 typedef struct
26 {
27 char word[ 100 ];
28 int offset;
29 int length;
30 } WORD_IDX;
31
32 /*
33 * Get a OFF_LEN struct of a word.
34 */
35 static void * get_words( char * filename, DICT_INFO * dict_info, WORD_IDX * word_idx);
36
37 /*
38 * Binary search for word's idx information.
39 */
40 WORD_IDX * get_idx( char * word,WORD_IDX * word_idx, DICT_INFO * dict_info0);
41 inline static int to_int(unsigned char * from_int);
42 #endif /* _DICT_IDX_H */
词典索引源码(dict_idx.c):
Code highlighting produced by Actipro CodeHighlighter (freeware)
http://www.CodeHighlighter.com/
> 1 /*
2 * dict_idx.c
3 * Author:shoru
4 * 2009-09-09 12:27
5 */
6
7 #include < stdlib.h >
8 #include < stdio.h >
9 #include < string .h >
10 #include " dict_idx.h "
11 #include " dict_info.h "
12
13 static void * get_words( char * filename, DICT_INFO * dict_info, WORD_IDX * word_idx)
14 {
15 FILE * fd = fopen(filename, " rb " );
16 size_t nread = 0 ;
17
18 if (fd == NULL || dict_info == NULL)
19 {
20 return NULL;
21 }
22 unsigned char buffer[dict_info -> idxfilesize];
23
24 nread = fread(buffer,dict_info -> idxfilesize, 1 ,fd);
25
26 unsigned char * head, * tail;
27 head = tail = buffer;
28 int it = 0 ;
29 int total = 1 ;
30 for (; it < dict_info -> idxfilesize; it ++ )
31 {
32 if ( * head == ' \0 ' )
33 {
34 strncpy((word_idx + total) -> word,tail,head - tail + 1 );
35 (word_idx + total) -> offset = to_int(head + 1 );
36 (word_idx + total) -> length = to_int(head + 5 );
37 total ++ ;
38 head += 9 ;
39 tail = head;
40 if (total == dict_info -> wordcount) break ;
41 } else {
42 head ++ ;
43 continue ;
44 }
45 }
46 }
47
/*
 * Decode the four bytes at from_int as a 32-bit integer, most significant
 * byte first (big-endian).
 *
 * The value is assembled in unsigned arithmetic so that a first byte of
 * 0x80 or above never shifts into the sign bit of an int. Values above
 * INT_MAX are converted to int in the implementation-defined way.
 */
inline static int to_int(unsigned char *from_int)
{
    unsigned long value = ((unsigned long)from_int[0] << 24)
                        | ((unsigned long)from_int[1] << 16)
                        | ((unsigned long)from_int[2] << 8)
                        | (unsigned long)from_int[3];

    return (int)value;
}
52
53 WORD_IDX * get_idx( char * word,WORD_IDX * word_idx, DICT_INFO * dict_info)
54 {
55 if (word == NULL || word_idx == NULL || dict_info == NULL)
56 {
57 return NULL;
58 }
59 int head = 0 ,tail = dict_info -> wordcount,cur = tail / 2 ;
60
61 int i = 0 ;
62
63 while (TRUE)
64 {
65 int cmp = strcasecmp(word,word_idx[cur].word);
66 if ( 0 == cmp)
67 {
68 return & word_idx[cur];
69 } else if ( 0 > cmp){
70 tail = cur;
71 } else {
72 head = cur;
73 }
74 cur = (tail + head) / 2 ;
75 }
76 }
77
78
79
80
#ifdef DEBUG

/*
 * Demo entry point: load the index, look a word up and print its
 * explanation from the .dict file.
 *
 * The word defaults to "a"; a different word may be given as the first
 * command-line argument.
 *
 * Returns EXIT_SUCCESS on success and -1 on any failure.
 */
int main(int argc, char **argv)
{
    char *filename = "../dict/oxford-gb/oxford-gb-formated.idx";
    char *dictname = "../dict/oxford-gb/oxford-gb-formated.dict";
    char *query = "a";
    FILE *dict = NULL;
    char *explain = NULL;
    int rc = -1;

    if (argc > 1) {
        query = argv[1];
    }

    /* Sizes of the bundled dictionary, as recorded in its .ifo file. */
    DICT_INFO dict_info = {0};
    dict_info.wordcount = 39429;
    dict_info.idxfilesize = 721264;

    /* Zeroed so that entries the loader does not fill are empty strings. */
    WORD_IDX *idx = calloc((size_t)dict_info.wordcount, sizeof *idx);
    if (idx == NULL) {
        printf("alloc error\n");
        return -1;
    }
    get_words(filename, &dict_info, idx);

    WORD_IDX *word = get_idx(query, idx, &dict_info);
    if (word == NULL || word->length < 0) {
        printf("word not found\n");
        goto out;
    }

    printf("%s,%d,%d\n", word->word, word->offset, word->length);

    /* Binary mode: offset and length are byte positions in the file. */
    dict = fopen(dictname, "rb");
    if (dict == NULL) {
        printf("dict error\n");
        goto out;
    }
    if (0 != fseek(dict, word->offset, SEEK_SET)) {
        printf("seek error\n");
        goto out;
    }

    explain = malloc((size_t)word->length + 1);
    if (explain == NULL) {
        printf("alloc error\n");
        goto out;
    }

    /* Terminate after the bytes actually read. */
    size_t got = fread(explain, 1, (size_t)word->length, dict);
    explain[got] = '\0';
    if (got != (size_t)word->length) {
        printf("read error\n");
        goto out;
    }

    printf("%s\n", explain);
    rc = EXIT_SUCCESS;

out:
    free(explain);
    if (dict != NULL) {
        fclose(dict);
    }
    free(idx);
    return rc;
}

#endif /* DEBUG */
转载于:https://my.oschina.net/shoru/blog/6200