61 #include "ngram_model_internal.h"
68 ext = strrchr(file_name,
'.');
73 while (--ext >= file_name) {
74 if (*ext ==
'.')
break;
76 if (ext < file_name) {
81 while (--ext >= file_name) {
82 if (*ext ==
'.')
break;
84 if (ext < file_name) {
122 const char *file_name,
130 if ((model = ngram_model_arpa_read(config, file_name, lmath)) != NULL)
132 if ((model = ngram_model_dmp_read(config, file_name, lmath)) != NULL)
137 model = ngram_model_arpa_read(config, file_name, lmath);
140 model = ngram_model_dmp_read(config, file_name, lmath);
143 E_ERROR(
"language model file type not supported\n");
154 lw = cmd_ln_float32_r(config,
"-lw");
156 wip = cmd_ln_float32_r(config,
"-wip");
158 uw = cmd_ln_float32_r(config,
"-uw");
179 return ngram_model_arpa_write(model, file_name);
181 return ngram_model_dmp_write(model, file_name);
183 E_ERROR(
"language model file type not supported\n");
186 E_ERROR(
"language model file type not supported\n");
194 int32 n, int32 n_unigram)
203 if (base->
lmath != lmath) {
218 for (i = 0; i < base->
n_words; ++i) {
266 for (i = 0; i < model->
n_words; ++i) {
277 for (j = 0; j < lmclass->
n_words; ++j) {
280 for (j = 0; j < lmclass->
n_hash; ++j) {
281 if (lmclass->nword_hash[j].
wid != -1) {
288 ngram_class_free(model->
classes[i]);
312 for (i = 0; i < model->
n_words; ++i) {
321 if (outstr[0] ==
'<' || outstr[0] ==
'[') {
340 E_WARN(
"Duplicate word in dictionary after conversion: %s\n",
346 model->
wid = new_wid;
362 if ((ic = iconv_open(to, from)) == (iconv_t)-1) {
374 for (i = 0; i < model->
n_words; ++i) {
375 if (strlen(model->
word_str[i]) > maxlen)
376 maxlen = strlen(model->
word_str[i]);
383 maxlen = maxlen *
sizeof(int) + 15;
388 for (i = 0; i < model->
n_words; ++i) {
389 ICONV_CONST
char *in;
391 size_t inleft, outleft, result;
394 in = (ICONV_CONST
char *)model->
word_str[i];
400 while ((result = iconv(ic, &in, &inleft, &out, &outleft)) == (size_t)-1) {
401 if (errno != E2BIG) {
410 iconv(ic, NULL, NULL, NULL, NULL);
415 in = (ICONV_CONST
char *)model->
word_str[i];
420 if ((result = iconv(ic, NULL, NULL, &out, &outleft)) == (size_t)-1) {
421 if (errno != E2BIG) {
430 iconv(ic, NULL, NULL, NULL, NULL);
435 goto start_conversion;
438 result = maxlen - outleft;
450 memcpy(model->
word_str[i], outbuf, result);
456 E_WARN(
"Duplicate word in dictionary after conversion: %s\n",
464 model->
wid = new_wid;
478 float32 lw, float32 wip, float32 uw)
487 if (out_log_wip) *out_log_wip = model->
log_wip;
488 if (out_log_uw) *out_log_uw = model->
log_uw;
495 int32 n_hist, int32 *n_used)
497 int32 score, class_weight = 0;
505 if (NGRAM_IS_CLASSWID(wid)) {
508 class_weight = ngram_class_prob(lmclass, wid);
509 if (class_weight == 1)
513 for (i = 0; i < n_hist; ++i) {
517 score = (*model->
funcs->
score)(model, wid, history, n_hist, n_used);
520 return score + class_weight;
533 va_start(history, word);
535 while ((hword = va_arg(history,
const char *)) != NULL)
540 va_start(history, word);
542 while ((hword = va_arg(history,
const char *)) != NULL) {
543 histid[n_hist] =
ngram_wid(model, hword);
549 histid, n_hist, &n_used);
571 int32 n_hist, int32 *n_used)
573 int32 prob, class_weight = 0;
581 if (NGRAM_IS_CLASSWID(wid)) {
584 class_weight = ngram_class_prob(lmclass, wid);
585 if (class_weight == 1)
589 for (i = 0; i < n_hist; ++i) {
596 return prob + class_weight;
609 va_start(history, word);
611 while ((hword = va_arg(history,
const char *)) != NULL)
616 va_start(history, word);
618 while ((hword = va_arg(history,
const char *)) != NULL) {
619 histid[n_hist] =
ngram_wid(model, hword);
625 histid, n_hist, &n_used);
638 prob = (int32)(prob / base->
lw);
680 int m,
int successor)
710 va_start(history, word);
712 while ((hword = va_arg(history,
const char *)) != NULL)
717 va_start(history, word);
719 while ((hword = va_arg(history,
const char *)) != NULL) {
720 histid[n_hist] =
ngram_wid(model, hword);
733 if (n_hist >= model->
n)
737 return (*model->
funcs->
iter)(model, wid, history, n_hist);
744 if (itor->
m == itor->model->
n - 1)
754 return (*itor->model->
funcs->
iter_get)(itor, out_score, out_bowt);
785 wid = NGRAM_BASEWID(wid);
805 wid = NGRAM_CLASSWID(wid, classid);
809 E_ERROR(
"Duplicate definition of word %s\n", word);
823 E_ERROR(
"Hash insertion failed for word %s => %p (should not happen)\n",
833 const char *word, float32 weight)
839 E_WARN(
"Can't add word '%s' to read-only language model. "
840 "Disable mmap with '-mmap no' to make it writable\n", word);
844 wid = ngram_add_word_internal(model, word, -1);
871 lmclass->nword_hash = NULL;
874 for (gn = classwords; gn; gn = gnode_next(gn)) {
875 tprob += gnode_float32(gn);
877 if (tprob > 1.1 || tprob < 0.9) {
878 E_WARN(
"Total class probability is %f, will normalize\n", tprob);
879 for (gn = classwords; gn; gn = gnode_next(gn)) {
880 gn->data.fl /= tprob;
883 for (i = 0, gn = classwords; gn; ++i, gn = gnode_next(gn)) {
891 ngram_class_add_word(
ngram_class_t *lmclass, int32 wid, int32 lweight)
895 if (lmclass->nword_hash == NULL) {
897 lmclass->nword_hash =
ckd_malloc(NGRAM_HASH_SIZE *
sizeof(*lmclass->nword_hash));
898 memset(lmclass->nword_hash, 0xff, NGRAM_HASH_SIZE *
sizeof(*lmclass->nword_hash));
899 lmclass->
n_hash = NGRAM_HASH_SIZE;
905 hash = wid & (lmclass->
n_hash - 1);
906 if (lmclass->nword_hash[hash].
wid == -1) {
908 lmclass->nword_hash[hash].
wid = wid;
909 lmclass->nword_hash[hash].
prob1 = lweight;
916 while (lmclass->nword_hash[hash].
next != -1)
917 hash = lmclass->nword_hash[hash].
next;
922 lmclass->nword_hash =
ckd_realloc(lmclass->nword_hash,
923 lmclass->
n_hash * 2 *
sizeof(*lmclass->nword_hash));
924 memset(lmclass->nword_hash + lmclass->
n_hash,
925 0xff, lmclass->
n_hash *
sizeof(*lmclass->nword_hash));
932 for (next = 0; next < lmclass->
n_hash; ++next)
933 if (lmclass->nword_hash[next].
wid == -1)
936 assert(next != lmclass->
n_hash);
938 lmclass->nword_hash[next].
wid = wid;
939 lmclass->nword_hash[next].
prob1 = lweight;
940 lmclass->nword_hash[hash].
next = next;
956 const char *classname,
961 int32 classid, tag_wid, wid, i, scale;
969 E_ERROR(
"No such word or class tag: %s\n", classname);
972 for (classid = 0; classid < model->
n_classes; ++classid) {
978 E_ERROR(
"Word %s is not a class tag (call ngram_model_add_class() first)\n", classname);
981 lmclass = model->
classes[classid];
984 wid = ngram_add_word_internal(model, word, classid);
994 for (i = 0; i < lmclass->
n_words; ++i)
995 lmclass->
prob1[i] += scale;
996 for (i = 0; i < lmclass->
n_hash; ++i)
997 if (lmclass->nword_hash[i].
wid != -1)
998 lmclass->nword_hash[i].
prob1 += scale;
1001 return ngram_class_add_word(lmclass, wid,
logmath_log(model->
lmath, fprob));
1006 const char *classname,
1007 float32 classweight,
1009 const float32 *weights,
1014 int32 i, start_wid = -1;
1015 int32 classid, tag_wid;
1025 E_ERROR(
"Number of classes cannot exceed 128 (sorry)\n");
1029 for (i = 0; i < n_words; ++i) {
1032 wid = ngram_add_word_internal(model, words[i], classid);
1035 if (start_wid == -1)
1036 start_wid = NGRAM_BASEWID(wid);
1040 lmclass = ngram_class_new(model, tag_wid, start_wid, classwords);
1042 if (lmclass == NULL)
1051 model->
classes[classid] = lmclass;
1058 int32 base_wid = NGRAM_BASEWID(wid);
1060 if (base_wid < lmclass->start_wid
1065 hash = wid & (lmclass->
n_hash - 1);
1066 while (hash != -1 && lmclass->nword_hash[hash].
wid != wid)
1067 hash = lmclass->nword_hash[hash].
next;
1070 return lmclass->nword_hash[hash].
prob1;
1078 read_classdef_file(
hash_table_t *classes,
const char *file_name)
1087 char *classname = NULL;
1089 if ((fp =
fopen_comp(file_name,
"r", &is_pipe)) == NULL) {
1090 E_ERROR(
"File %s not found\n", file_name);
1100 if (fgets(line,
sizeof(line), fp) == NULL)
1109 if (n_words == 2 && 0 == strcmp(wptr[0],
"END")) {
1114 if (classname == NULL || 0 != strcmp(wptr[1], classname))
1123 classdef->words =
ckd_calloc(classdef->n_words,
1124 sizeof(*classdef->words));
1125 classdef->weights =
ckd_calloc(classdef->n_words,
1126 sizeof(*classdef->weights));
1128 weight = classprobs;
1129 for (i = 0; i < classdef->n_words; ++i) {
1131 classdef->weights[i] = gnode_float32(weight);
1132 word = gnode_next(word);
1133 weight = gnode_next(weight);
1138 classdef_free(classdef);
1153 fprob = (float32)
atof_c(wptr[1]);
1163 if (n_words == 2 && 0 == strcmp(wptr[0],
"LMCLASS")) {
1177 for (gn = classwords; gn; gn = gnode_next(gn))
1190 for (i = 0; i < classdef->n_words; ++i)
1200 const char *file_name)
1208 if (read_classdef_file(classes, file_name) < 0) {
1215 for (gn = hl; gn; gn = gnode_next(gn)) {
1222 classdef->n_words) < 0)
1228 for (gn = hl; gn; gn = gnode_next(gn)) {
1231 classdef_free(he->
val);