Commit c4435aa

More or less working annotate_text. Should split on spaces...
1 parent aa190d2 commit c4435aa

6 files changed, +60 -22 lines changed


annotate_text.py

+33
@@ -0,0 +1,33 @@
+input_text = """Segneur ,%% sachies que Mil et .C. .iiijxx. et .xvii. ans apries l'incarnation ihesucrist ,%.%. Au tans Innocent ,%% l'apostole de Rome ,%,%. et Phelippon ,%% roi de france ,%%
+et Richart ,%% roi d'engleterre ,%,%. ot vn saint home en france ,%,%. qui ot nom Fouques de Nuelly (%.%. Cil Nuellys siet entre Nuelly sour Marne %,%. et paris )%.%. Et il estoit priessres et tenoit la perroche de la uille .%.%. Et ichil Fouques dont ie vos di
+commencha a parle de diu %,%. par france %,%.
+et par les autres pais entour ;%.%.
+Et sachies que nostre sires fist maintes bieles miracles pour lui .%,%. et tant que la renommee de cel saint home ala ,%% tant qu'ele vint a l'apostole de Rome Innocent ;%.%. Et l'apostoles manda en france au saint home %,%. que il preechast des crois par s'auctorite ;%.%. Et apres i enuoia .i. sien cardonnal ,%% Maistre Pieron de Capes ,%% croisie ,%,%. et manda par lui le pardon tel con ie vous dirai :%.%.
+Tout chil qui se croiseroient %,%. et feroient le sieruice diu .i. an en l'ost %,%[punctelev]
+seroient quite de toz lor pechies quil auoient fais ,%% dont il seroient confies .%.%.
+Pour che que chius pardons fu si grans ,%,%. si s'en esmurent moult li cuer des gens ,%,%. et moult s'en croisierent
+pour chou que li pardons estoit si grans .§%.§%.
+
+
+EN l'autre an apries que chil preudom Fouques parla de diu ,%,%[punctelev] ot .i. tournoi en champaigne ,%%
+a .i. castiel qui a non Aicri .%,%. et par la grace de diu si auint ke Thiebaus ,%% quens de champaigne
+et de Brie ,%% prist la crois ,%,%. et li cuens Looys de Bloys %,%. et de chartaing .%.%.
+Et che fu a l'entree des Auens .%,%. et chil cuens thiebaus estoit iouenes hom et n'auoit pas plus de .xxij. ans ,%.%. Ne li cuens Looys n'auoit pas plus de .xxvij. ans .%,%. Chil doi conte ierent neueu le roi de france %,%. et cousin germain et neueu le roi d'engleterre %.%. De l'autre part .%% auoec ces .ij. contes se croisierent doi moult haut baron de france ,%.%. Symons de Montfort %,%. et Renaus de Mommirail .%.%. Moult fu grans la renommee par les terres .%,%[punctelev] quant cil doi se croisierent .§%.§%.
+
+
+EN la terre le conte de champaigne se croisa Gerniers li euesques de Troies ,%,%. et li cuens Gautiers de Braine ,%.%. Joffrois de Joinuile ,%,%.
+qui estoit senescaus de la tiere ,%.%.
+Robiers ses freres ,%.%. Gautiers de voignori ,%.%. Gautiers de Mombelyart ,%.%.
+Eustasces d'escouflans ,%.%. Guis dou plaissie %,%. et ses freres ,%% Henris D'ardillieres ,%.%. Ogiers de saint chienon ,%.%.""".replace(
+    "%", "").replace("\n", " ").replace(" ", "")
+
+print(input_text)
+
+from boudams.tagger import Seq2SeqTokenizer
+import logging
+
+logger = logging.getLogger()
+logger.setLevel(logging.DEBUG)
+
+tokenizer = Seq2SeqTokenizer.load("/home/thibault/dev/boudams/models/linear-conv2019-05-24--14:08:58-0.0001.tar", device="cpu")
+print("".join(tokenizer.annotate_text(input_text)))

boudams/model/linear.py

+1 -1
@@ -103,7 +103,7 @@ def predict(self, src, src_len, label_encoder: "LabelEncoder") -> torch.Tensor:
         """
         out = self(src, src_len, None, teacher_forcing_ratio=0)
         logits = torch.argmax(out, 2)
-        return label_encoder.reverse_batch(logits, masked=src)
+        return label_encoder.reverse_batch(logits, masked=src, ignore=(self.pad_idx, self.eos_idx, self.sos_idx))
 
     def gradient(
             self,
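
The one-line change passes the special-token indices through so that decoded output no longer contains padding, SOS, or EOS predictions. A hedged sketch of what the ignore tuple achieves (a hypothetical decode helper, not the actual LabelEncoder.reverse_batch):

import torch

def decode(indices: torch.Tensor, itos: dict, ignore=(0, 1, 2)) -> list:
    # indices: (batch, seq_len) class ids, i.e. the result of torch.argmax(out, 2)
    return [
        "".join(itos[int(i)] for i in row if int(i) not in ignore)
        for row in indices
    ]

itos = {0: "<SOS>", 1: "<EOS>", 2: "<PAD>", 3: "a", 4: "b", 5: " "}
print(decode(torch.tensor([[0, 3, 5, 4, 1, 2, 2]]), itos))  # ['a b']

Without the filter, every padded position of a short sequence would decode to a <PAD> glyph.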

boudams/tagger.py

+23 -18
@@ -231,26 +231,31 @@ def load(cls, fpath="./model.tar", device=DEVICE):
 
         return obj
 
-    def annotate(self, texts: List[str]):
+    def annotate(self, texts: List[str], batch_size=32):
         self.model.eval()
-        for sentence in texts:
-
-            # it would be good at some point to keep and use order to batchify this
-            tensor, sentence_length, _ = self.vocabulary.pad_and_tensorize(
-                [self.vocabulary.inp_to_numerical(self.vocabulary.prepare(sentence))[0]],
-                device=self.device,
-                padding=self.out_max_sentence_length-len(sentence)
-            )
-
-            from .model.base import pprint_2d
-            #pprint_2d(tensor.t())
-            #print(sentence_length)
-
-            logging.debug("Input Tensor {}".format(tensor.shape))
-            logging.debug("Input Positions tensor {}".format(sentence_length.shape))
+        for n in range(0, len(texts), batch_size):
+            batch = texts[n:n+batch_size]
+            xs = [
+                self.vocabulary.inp_to_numerical(self.vocabulary.prepare(s))
+                for s in batch
+            ]
+            logging.info("Dealing with batch %s " % (int(n/batch_size)+1))
+            tensor, sentence_length, order = self.vocabulary.pad_and_tensorize(
+                [x for x, _ in xs],
+                device=self.device,
+                padding=max(list(map(lambda x: x[1], xs)))
+            )
 
-            translation = self.model.predict(
+            translations = self.model.predict(
                 tensor, sentence_length, label_encoder=self.vocabulary
             )
 
-            yield "".join(translation[0])
+            for index in range(len(batch)):
+                yield "".join(translations[order.index(index)])
+
+    def annotate_text(self, string, batch_size=32):
+        strings = [
+            string[n:n+self.out_max_sentence_length-10]
+            for n in range(0, len(string), self.out_max_sentence_length - 10)
+        ]
+        yield from self.annotate(strings, batch_size=batch_size)
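
Two details in the new annotate are easy to miss. pad_and_tensorize now returns order, which records how the batch was rearranged (presumably sorted by length before padding), and translations[order.index(index)] recovers the prediction belonging to input index. A standalone sketch of that inversion, assuming a sort by descending length:

# order[k] = original index of the k-th row after sorting (assumed behavior).
batch = ["bb", "a", "ccc"]
order = sorted(range(len(batch)), key=lambda i: -len(batch[i]))  # [2, 0, 1]
sorted_batch = [batch[i] for i in order]
restored = [sorted_batch[order.index(i)] for i in range(len(batch))]
assert restored == batch

The other detail is annotate_text itself: it slices the input into fixed windows of out_max_sentence_length - 10 characters, so a window can cut through the middle of a word. That is the limitation the commit message's "should split on spaces" refers to.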

linear_run.py

+1 -1
@@ -8,7 +8,7 @@
 logger = logging.getLogger()
 logger.setLevel(logging.DEBUG)
 
-EPOCHS = 10
+EPOCHS = 100
 TEST = "seints"
 RANDOM = True
 DEVICE = "cuda"

load.py

+1 -1
@@ -21,7 +21,7 @@
     for line in Examples.split("\n")
 ]
 
-for model in glob.glob("/home/thibault/dev/boudams/models/lstm2019-05-22--09:23:38-0.0001.tar"):
+for model in glob.glob("/home/thibault/dev/boudams/models/linear-conv2019-05-24--14:08:58-0.0001.tar"):
     tokenizer = Seq2SeqTokenizer.load(model, device="cpu")
     print(tokenizer.model)
     treated = tokenizer.annotate([x[0] for x in Examples])

voc-2.json

+1 -1
@@ -1 +1 @@
-{"itos": {"0": "<SOS>", "1": "<EOS>", "2": "<PAD>", "5": "l", "4": "r", "6": "u", "7": "g", "8": "x", "9": "c", "10": "]", "11": "z", "12": " ", "13": "j", "14": ";", "15": "e", "16": "f", "17": "a", "18": "[", "19": "k", "20": "h", "21": "?", "22": "y", "23": "!", "24": "w", "25": "v", "26": "-", "27": ")", "28": "m", "29": "s", "30": "q", "31": "d", "32": "i", "33": "t", "34": "'", "35": ",", "36": ".", "37": "(", "38": "n", "39": "b", "40": "p", "41": ":", "42": "o"}, "stoi": {"<SOS>": 0, "<EOS>": 1, "<PAD>": 2, "<UNK>": 5, "r": 4, "l": 5, "u": 6, "g": 7, "x": 8, "c": 9, "]": 10, "z": 11, " ": 12, "j": 13, ";": 14, "e": 15, "f": 16, "a": 17, "[": 18, "k": 19, "h": 20, "?": 21, "y": 22, "!": 23, "w": 24, "v": 25, "-": 26, ")": 27, "m": 28, "s": 29, "q": 30, "d": 31, "i": 32, "t": 33, "'": 34, ",": 35, ".": 36, "(": 37, "n": 38, "b": 39, "p": 40, ":": 41, "o": 42}, "params": {"init_token": "<SOS>", "eos_token": "<EOS>", "pad_token": "<PAD>", "unk_token": "<UNK>", "mask_token": "x", "remove_diacriticals": true, "lower": true, "masked": true}}
+{"itos": {"0": "<SOS>", "1": "<EOS>", "2": "<PAD>", "5": "r", "4": "!", "6": " ", "7": "q", "8": "t", "9": "p", "10": "v", "11": "x", "12": ";", "13": "l", "14": "n", "15": ".", "16": "s", "17": "g", "18": ")", "19": "o", "20": "z", "21": "e", "22": "k", "23": "(", "24": "d", "25": ":", "26": "a", "27": "?", "28": "u", "29": "b", "30": "j", "31": "i", "32": "'", "33": "y", "34": "f", "35": "m", "36": "w", "37": "c", "38": "]", "39": "[", "40": "h", "41": ",", "42": "-"}, "stoi": {"<SOS>": 0, "<EOS>": 1, "<PAD>": 2, "<UNK>": 5, "!": 4, "r": 5, " ": 6, "q": 7, "t": 8, "p": 9, "v": 10, "x": 11, ";": 12, "l": 13, "n": 14, ".": 15, "s": 16, "g": 17, ")": 18, "o": 19, "z": 20, "e": 21, "k": 22, "(": 23, "d": 24, ":": 25, "a": 26, "?": 27, "u": 28, "b": 29, "j": 30, "i": 31, "'": 32, "y": 33, "f": 34, "m": 35, "w": 36, "c": 37, "]": 38, "[": 39, "h": 40, ",": 41, "-": 42}, "params": {"init_token": "<SOS>", "eos_token": "<EOS>", "pad_token": "<PAD>", "unk_token": "<UNK>", "mask_token": "x", "remove_diacriticals": true, "lower": true, "masked": true}}
