Skip to content

Commit 6ac1755

Browse files
Prevision du cli
1 parent c4435aa commit 6ac1755

File tree

8 files changed

+29
-34
lines changed

8 files changed

+29
-34
lines changed

README.md

+7
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,13 @@ space and they are separated by tabs (`\t`, marked here as `<TAB>`).
1111
Things still need a few more tweaks here and there; I'd like to see how Attention will perform. This model is
1212
particularly built for OCR/HTR output from manuscripts where spaces are inconsistent.
1313

14+
15+
```text
16+
Train Loss: 0.004 | Perplexity: 1.004 | Acc.: 0.566 | Lev.: 0.037 | Lev. / char: 0.001
17+
Val. Loss: 0.066 | Perplexity: 1.069 | Acc.: 0.585 | Lev.: 0.272 | Lev. / char: 0.009
18+
Test Loss: 0.057 | Perplexity: 1.059 | Acc.: 0.586 | Lev.: 0.235 | Lev. / char: 0.008
19+
```
20+
1421
## Examples
1522

1623
### BiDirectional GRU with Attention

boudams/cli.py

Whitespace-only changes.

boudams/dataset/utils.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,10 @@ def check(input_path, max_length=100):
200200

201201

202202
if __name__ == "__main__":
203-
convert("/home/thibault/dev/LiSeinConfessorPandora/data/lemmatises/*.tsv", "/home/thibault/dev/boudams/data/seints")
204-
split("/home/thibault/dev/boudams/data/seints/*")
205-
check("/home/thibault/dev/boudams/data/seints/")
203+
output = "/home/thibault/dev/boudams/data/seints"
204+
output = "/home/thibault/dev/boudams/data/fro"
205+
inp = "/home/thibault/dev/LiSeinConfessorPandora/data/lemmatises/*.tsv"
206+
inp = "/home/thibault/dev/boudams/data/inp/*.tab"
207+
convert(inp, output, dict_reader=True)
208+
split(output + "/*")
209+
check(output+"/")

boudams/encoder.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import random
88
import json
99
import unidecode
10-
10+
from operator import itemgetter
1111

1212
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
1313
DEFAULT_INIT_TOKEN = "<SOS>"
@@ -299,6 +299,7 @@ def pad_and_tensorize(
299299
# Packed sequence need to be in decreasing size order
300300
for current in sequences:
301301
order.append(sentences.index(current))
302+
sentences[order[-1]] = None # We replace this index with nothing in case some segments are equals
302303
tensor.append(current + [self.pad_token_index] * (max_len - len(current)))
303304
lengths.append(len(tensor[-1]) - max(0, max_len - len(current)))
304305

boudams/tagger.py

+10-27
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
11
import torch
22
import torch.cuda
33

4-
from torchtext.data import ReversibleField, BucketIterator
5-
64
import os
75
import json
86
import tarfile
97
import logging
8+
import re
109
from typing import List, Tuple
1110

1211
from .model import gru, lstm, bidir, conv, linear
@@ -172,24 +171,6 @@ def sostoken(self):
172171
def eostoken(self):
173172
return self.vocabulary.eos_token_index
174173

175-
def tag(self, iterator: BucketIterator):
176-
self.model.eval()
177-
for i, batch in enumerate(iterator):
178-
src, src_len = batch.src
179-
output, attention = self.model(
180-
src, src_len, trg=None,
181-
teacher_forcing_ratio=0
182-
) # turn off teacher forcing
183-
184-
# trg = [trg sent len, batch size]
185-
# output = [Maximum Sentence Length, Number of Sentence in batch, Number of possible characters]
186-
_, ind = torch.topk(output, 1, dim=2)
187-
# ind = [Maximum Sentence Length, Number of Sentences in Batch, One Result]
188-
189-
# output = output[1:].view(-1, output.shape[-1])
190-
191-
yield ind.squeeze().permute(1, 0)
192-
193174
@property
194175
def settings(self):
195176
return {
@@ -249,13 +230,15 @@ def annotate(self, texts: List[str], batch_size=32):
249230
translations = self.model.predict(
250231
tensor, sentence_length, label_encoder=self.vocabulary
251232
)
252-
253-
for index in range(len(batch)):
233+
for index in range(len(translations)):
254234
yield "".join(translations[order.index(index)])
255235

256-
def annotate_text(self, string, batch_size=32):
257-
strings = [
258-
string[n:n+self.out_max_sentence_length-10]
259-
for n in range(0, len(string), self.out_max_sentence_length - 10)
260-
]
236+
def annotate_text(self, string, splitter=r"(\W+)", batch_size=32):
237+
splitter = re.compile(splitter)
238+
splits = splitter.split(string)
239+
240+
tempList = splits + [""] * 2
241+
strings = ["".join(tempList[n:n + 2]) for n in range(0, len(splits), 2)]
242+
strings = list(filter(len, strings))
243+
261244
yield from self.annotate(strings, batch_size=batch_size)

linear_run.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
logger.setLevel(logging.DEBUG)
1010

1111
EPOCHS = 100
12-
TEST = "seints"
12+
TEST = "fro"
1313
RANDOM = True
1414
DEVICE = "cuda"
1515
MAXIMUM_LENGTH = 100

test.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
train_path, dev_path, test_path = "data/fro/train.tsv", "data/fro/dev.tsv", "data/fro/test.tsv"
2222

2323

24-
for model in glob.glob("/home/thibault/dev/boudams/models/linear-conv2019-05-24--14:08:58-0.0001.tar"):
24+
for model in glob.glob("/home/thibault/dev/boudams/models/linear-conv2019-05-24--23:53:32-0.0001.tar"):
2525
tokenizer = Seq2SeqTokenizer.load(model, device=DEVICE)
2626
print("Model : " + tokenizer.system.upper() + " from " + model)
2727
test_data = tokenizer.vocabulary.get_dataset(test_path, randomized=False)

voc-2.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
{"itos": {"0": "<SOS>", "1": "<EOS>", "2": "<PAD>", "5": "r", "4": "!", "6": " ", "7": "q", "8": "t", "9": "p", "10": "v", "11": "x", "12": ";", "13": "l", "14": "n", "15": ".", "16": "s", "17": "g", "18": ")", "19": "o", "20": "z", "21": "e", "22": "k", "23": "(", "24": "d", "25": ":", "26": "a", "27": "?", "28": "u", "29": "b", "30": "j", "31": "i", "32": "'", "33": "y", "34": "f", "35": "m", "36": "w", "37": "c", "38": "]", "39": "[", "40": "h", "41": ",", "42": "-"}, "stoi": {"<SOS>": 0, "<EOS>": 1, "<PAD>": 2, "<UNK>": 5, "!": 4, "r": 5, " ": 6, "q": 7, "t": 8, "p": 9, "v": 10, "x": 11, ";": 12, "l": 13, "n": 14, ".": 15, "s": 16, "g": 17, ")": 18, "o": 19, "z": 20, "e": 21, "k": 22, "(": 23, "d": 24, ":": 25, "a": 26, "?": 27, "u": 28, "b": 29, "j": 30, "i": 31, "'": 32, "y": 33, "f": 34, "m": 35, "w": 36, "c": 37, "]": 38, "[": 39, "h": 40, ",": 41, "-": 42}, "params": {"init_token": "<SOS>", "eos_token": "<EOS>", "pad_token": "<PAD>", "unk_token": "<UNK>", "mask_token": "x", "remove_diacriticals": true, "lower": true, "masked": true}}
1+
{"itos": {"0": "<SOS>", "1": "<EOS>", "2": "<PAD>", "5": "1", "4": ",", "6": "6", "7": "r", "8": "2", "9": "s", "10": "u", "11": "o", "12": "h", "13": "k", "14": "3", "15": "a", "16": "z", "17": "c", "18": "m", "19": "\"", "20": "q", "21": ">", "22": "g", "23": "0", "24": "w", "25": "!", "26": ";", "27": "_", "28": "n", "29": " ", "30": "v", "31": "y", "32": ":", "33": "?", "34": "b", "35": "'", "36": "p", "37": "d", "38": "l", "39": "i", "40": ".", "41": "j", "42": "x", "43": "f", "44": "-", "45": "t", "46": "e"}, "stoi": {"<SOS>": 0, "<EOS>": 1, "<PAD>": 2, "<UNK>": 5, ",": 4, "1": 5, "6": 6, "r": 7, "2": 8, "s": 9, "u": 10, "o": 11, "h": 12, "k": 13, "3": 14, "a": 15, "z": 16, "c": 17, "m": 18, "\"": 19, "q": 20, ">": 21, "g": 22, "0": 23, "w": 24, "!": 25, ";": 26, "_": 27, "n": 28, " ": 29, "v": 30, "y": 31, ":": 32, "?": 33, "b": 34, "'": 35, "p": 36, "d": 37, "l": 38, "i": 39, ".": 40, "j": 41, "x": 42, "f": 43, "-": 44, "t": 45, "e": 46}, "params": {"init_token": "<SOS>", "eos_token": "<EOS>", "pad_token": "<PAD>", "unk_token": "<UNK>", "mask_token": "x", "remove_diacriticals": true, "lower": true, "masked": true}}

0 commit comments

Comments
 (0)