IBM1_EM.py
import numpy as np
import math
import Utils
# Expectation Maximization for IBM Model 1
def expect_max(pfil_word_dict, pen_word_dict, plst_fil_sen, plst_en_sen):
    fil_occur = len(pfil_word_dict)
    en_occur = len(pen_word_dict)
    # IBM1 Expectation Maximization algorithm
    # translation table t(fil | en), initialized uniformly over the English vocabulary
    trans_en_fil_matrix = np.full((fil_occur, en_occur), 1 / en_occur, dtype=float)
    trans_en_fil_matrix_prev = np.ones((fil_occur, en_occur), dtype=float)
    int_count = 0
    while not Utils.is_converged(trans_en_fil_matrix, trans_en_fil_matrix_prev, int_count):
        int_count += 1
        # keep the current matrix as the previous iteration's estimate
        trans_en_fil_matrix_prev = trans_en_fil_matrix.copy()
        # expected counts count(fil, en), initialized to 0
        total_enfil = np.zeros((fil_occur, en_occur), dtype=float)
        # expected totals total(en), initialized to 0
        total_fin = np.zeros(en_occur, dtype=float)
        for int_index, lst_fil_sen in enumerate(plst_fil_sen):  # for all sentence pairs (fil, en) do
            lst_fil_words = lst_fil_sen.split(" ")
            lst_en_words = plst_en_sen[int_index].split(" ")
            # compute the normalization term for each Filipino word in the sentence
            total_sen = np.zeros(len(lst_fil_words), dtype=float)
            for int_index2, str_fil_word in enumerate(lst_fil_words):
                # skip empty tokens produced by consecutive spaces
                if str_fil_word == '':
                    continue
                for str_en_word in lst_en_words:
                    if str_en_word == '':
                        continue
                    int_index_fildict = pfil_word_dict[str_fil_word]
                    int_index_endict = pen_word_dict[str_en_word]
                    total_sen[int_index2] += trans_en_fil_matrix[int_index_fildict][int_index_endict]
            # collect counts
            for int_index2, str_fil_word in enumerate(lst_fil_words):  # for all Filipino words in the sentence do
                if str_fil_word == '':
                    continue
                for str_en_word in lst_en_words:  # for all English words in the sentence do
                    if str_en_word == '':
                        continue
                    int_index_fildict = pfil_word_dict[str_fil_word]
                    int_index_endict = pen_word_dict[str_en_word]
                    # fractional count: t(fil | en) divided by the sentence-level normalization
                    delta = trans_en_fil_matrix[int_index_fildict][int_index_endict] / total_sen[int_index2]
                    total_enfil[int_index_fildict][int_index_endict] += delta
                    total_fin[int_index_endict] += delta
        # estimate probabilities: t(fil | en) = count(fil, en) / total(en)
        for int_en_index in range(en_occur):  # for all English words do
            for int_fil_index in range(fil_occur):  # for all Filipino words do
                if total_enfil[int_fil_index][int_en_index] != 0:
                    trans_en_fil_matrix[int_fil_index][int_en_index] = total_enfil[int_fil_index][int_en_index] / total_fin[int_en_index]
    print("EM algorithm converged in", int_count - 1, "iterations")
    return trans_en_fil_matrix
def get_translation_prob(e, f, t, e_dict, f_dict):
    # IBM Model 1 sentence translation probability:
    # P(e | f) = const / (l_f + 1)^l_e * prod_j sum_i t(e_j | f_i)
    const = Utils.const
    l_e = len(e)
    l_f = len(f)
    res = const / math.pow(l_f + 1, l_e)
    for j in range(l_e):
        e_word = e[j]
        if e_word in e_dict:
            e_j = e_dict[e_word]
        else:
            print("word '" + e_word + "' is not found in the target language dictionary")
            continue
            # alternatively: return 0 for out-of-vocabulary target words
        # sum of t(e_j | f_i) over all source words f_i
        prob_sum = 0
        for i in range(l_f):
            f_word = f[i]
            if f_word in f_dict:
                f_i = f_dict[f_word]
                prob_sum += t[e_j][f_i]
            else:
                print("word '" + f_word + "' is not found in the source language dictionary")
        res *= prob_sum
    return res
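# A minimal usage sketch, not part of the original module. It assumes Utils provides
# is_converged and const as imported above, that each word dictionary maps a word to its
# row/column index in the translation matrix, and that the two sentence lists are parallel.
# The toy corpus below is hypothetical and only illustrates the expected inputs.
if __name__ == "__main__":
    lst_fil_sen = ["ang bahay", "ang libro"]
    lst_en_sen = ["the house", "the book"]
    fil_word_dict = {"ang": 0, "bahay": 1, "libro": 2}
    en_word_dict = {"the": 0, "house": 1, "book": 2}
    # estimate t(fil | en) from the toy parallel corpus
    t = expect_max(fil_word_dict, en_word_dict, lst_fil_sen, lst_en_sen)
    print(t)
    # score a Filipino sentence against an English sentence under IBM Model 1
    prob = get_translation_prob("ang bahay".split(" "), "the house".split(" "),
                                t, fil_word_dict, en_word_dict)
    print(prob)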