
Commit 166ffa3

cpuhrsch authored and facebook-github-bot committed
python get_line / getLine remove rng for supervised / PyTorch
Summary: See title.
Reviewed By: EdouardGrave
Differential Revision: D6619903
fbshipit-source-id: 658ac873859860e64faec02c62568f69e6350797
1 parent add7db5 commit 166ffa3
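In short: the commit exposes line tokenization to Python as get_line (backed by a new getLine pybind binding that returns words and labels separately), drops the unused rng parameter from the supervised Dictionary::getLine overload, and adds a PyTorch EmbeddingBag example. A minimal sketch of the new Python surface, assuming a trained supervised model saved as cooking.bin (the path is illustrative):

    from fastText import load_model

    # "cooking.bin" is illustrative; any supervised fastText model works.
    model = load_model("cooking.bin")
    words, labels = model.get_line("__label__baking Why does my bread collapse?")
    # words  -> the plain tokens of the line
    # labels -> tokens starting with the model's label prefix (__label__ by default)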

9 files changed: +161 −57 lines changed
@@ -0,0 +1,82 @@
+#!/usr/bin/env python
+
+# Copyright (c) 2017-present, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree. An additional grant
+# of patent rights can be found in the PATENTS file in the same directory.
+
+# NOTE: This requires PyTorch! We do not provide installation scripts to install PyTorch.
+# It is up to you to install this dependency if you want to execute this example.
+# PyTorch's website should give you clear instructions on this: http://pytorch.org/
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+from torch.nn.modules.sparse import EmbeddingBag
+import numpy as np
+import torch
+import random
+import string
+import time
+from fastText import load_model
+from torch.autograd import Variable
+
+
+class FastTextEmbeddingBag(EmbeddingBag):
+    def __init__(self, model_path):
+        self.model = load_model(model_path)
+        input_matrix = self.model.get_input_matrix()
+        input_matrix_shape = input_matrix.shape
+        super().__init__(input_matrix_shape[0], input_matrix_shape[1])
+        self.weight.data.copy_(torch.FloatTensor(input_matrix))
+
+    def forward(self, words):
+        word_subinds = np.empty([0], dtype=np.int64)
+        word_offsets = [0]
+        for word in words:
+            _, subinds = self.model.get_subwords(word)
+            word_subinds = np.concatenate((word_subinds, subinds))
+            word_offsets.append(word_offsets[-1] + len(subinds))
+        word_offsets = word_offsets[:-1]
+        ind = Variable(torch.LongTensor(word_subinds))
+        offsets = Variable(torch.LongTensor(word_offsets))
+        return super().forward(ind, offsets)
+
+
+def random_word(N):
+    return ''.join(
+        random.choices(
+            string.ascii_uppercase + string.ascii_lowercase + string.digits,
+            k=N
+        )
+    )
+
+
+if __name__ == "__main__":
+    ft_emb = FastTextEmbeddingBag("fil9.bin")
+    model = load_model("fil9.bin")
+    num_lines = 200
+    total_seconds = 0.0
+    total_words = 0
+    for _ in range(num_lines):
+        words = [
+            random_word(random.randint(1, 10))
+            for _ in range(random.randint(15, 25))
+        ]
+        total_words += len(words)
+        words_average_length = sum([len(word) for word in words]) / len(words)
+        start = time.clock()
+        words_emb = ft_emb(words)
+        total_seconds += (time.clock() - start)
+        for i in range(len(words)):
+            word = words[i]
+            ft_word_emb = model.get_word_vector(word)
+            py_emb = np.array(words_emb[i].data)
+            assert (np.isclose(ft_word_emb, py_emb).all())
+    print(
+        "Avg. {:2.5f} seconds to build embeddings for {} lines with a total of {} words.".
+        format(total_seconds, num_lines, total_words)
+    )
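The new class subclasses torch's EmbeddingBag, so it drops into a larger PyTorch model like any other module. A minimal usage sketch, assuming the class definition above is in scope and a trained model has been saved as fil9.bin (the train_unsupervised.py change below saves one):

    # Embed two words: forward() gathers each word's subword indices via fastText
    # and lets EmbeddingBag average the corresponding rows of the input matrix.
    emb = FastTextEmbeddingBag("fil9.bin")
    vectors = emb(["hello", "world"])
    print(vectors.size())  # torch.Size([2, dim]): one vector per input word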

python/doc/examples/train_supervised.py

+14-37
@@ -17,46 +17,23 @@
 from fastText import train_supervised
 from fastText.util import test

-
-# Return top-k predictions and probabilities for each line in the given file.
-def get_predictions(filename, model, k=1):
-    predictions = []
-    probabilities = []
-    with open(filename) as f:
-        for line in f:
-            line = line.strip()
-            labels, probs = model.predict(line, k)
-            predictions.append(labels)
-            probabilities.append(probs)
-    return predictions, probabilities
-
-
-# Parse and return list of labels
-def get_labels_from_file(filename, prefix="__label__"):
-    labels = []
-    with open(filename) as f:
-        for line in f:
-            line_labels = []
-            tokens = line.split()
-            for token in tokens:
-                if token.startswith(prefix):
-                    line_labels.append(token)
-            labels.append(line_labels)
-    return labels
-
-
 if __name__ == "__main__":
     train_data = os.path.join(os.getenv("DATADIR", ''), 'cooking.train')
     valid_data = os.path.join(os.getenv("DATADIR", ''), 'cooking.valid')
     # train_supervised uses the same arguments and defaults as the fastText cli
     model = train_supervised(
-        input=train_data, epoch=25, lr=1.0, wordNgrams=2, verbose=2, minCount=1
+        input=train_data, epoch=25, lr=1.0, wordNgrams=2, verbose=1, minCount=1
     )
-    k = 1
-    predictions, _ = get_predictions(valid_data, model, k=k)
-    valid_labels = get_labels_from_file(valid_data)
-    p, r = test(predictions, valid_labels, k=k)
-    print("N\t" + str(len(valid_labels)))
-    print("P@{}\t{:.3f}".format(k, p))
-    print("R@{}\t{:.3f}".format(k, r))
-    model.save_model(train_data + '.bin')
+    predictions = []
+    true_labels = []
+    with open(valid_data, 'r') as fid:
+        for line in fid:
+            words, labels = model.get_line(line.strip())
+            pred_labels, probs = model.predict(" ".join(words))
+            predictions += [pred_labels]
+            true_labels += [labels]
+    p, r = test(predictions, true_labels)
+    print("N\t" + str(len(predictions)))
+    print("P@{}\t{:.3f}".format(1, p))
+    print("R@{}\t{:.3f}".format(1, r))
+    model.save_model("cooking.bin")

python/doc/examples/train_unsupervised.py

+1
@@ -52,5 +52,6 @@ def similarity(v1, v2):
         input=os.path.join(os.getenv("DATADIR", ''), 'fil9'),
         model='skipgram',
     )
+    model.save_model("fil9.bin")
     dataset, corr, oov = compute_similarity('rw.txt')
     print("{0:20s}: {1:2.0f} (OOV: {2:2.0f}%)".format(dataset, corr, 0))

python/fastText/FastText.py

+16-1
@@ -172,6 +172,17 @@ def get_labels(self, include_freq=False):
         else:
             return self.get_words(include_freq)

+    def get_line(self, text):
+        """
+        Split a line of text into words and labels. Labels must start with
+        the prefix used to create the model (__label__ by default).
+        """
+        if text.find('\n') != -1:
+            raise ValueError(
+                "get_line processes one line at a time (remove \'\\n\')"
+            )
+        return self.f.getLine(text)
+
     def save_model(self, path):
         """Save the model to the given path"""
         self.f.saveModel(path)
@@ -251,6 +262,10 @@ def _build_args(args):

 def tokenize(text):
     """Given a string of text, tokenize it and return a list of tokens"""
+    if text.find('\n') != -1:
+        raise ValueError(
+            "tokenize processes one line at a time (remove \'\\n\')"
+        )
     f = fasttext.fasttext()
     return f.tokenize(text)

@@ -330,7 +345,7 @@ def train_unsupervised(
     as UTF-8. You might want to consult standard preprocessing scripts such
     as tokenizer.perl mentioned here: http://www.statmt.org/wmt07/baseline.html

-    The input fiel must not contain any labels or use the specified label prefix
+    The input field must not contain any labels or use the specified label prefix
     unless it is ok for those words to be ignored. For an example consult the
     dataset pulled by the example script word-vector-example.sh, which is
     part of the fastText repository.
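Both new guards make multi-line input fail fast instead of being silently mis-tokenized; callers are expected to split lines themselves. A small sketch of the resulting contract:

    import fastText

    try:
        fastText.tokenize("first line\nsecond line")
    except ValueError:
        pass  # expected: tokenize processes one line at a time
    for line in "first line\nsecond line".split("\n"):
        print(fastText.tokenize(line))  # tokenize each line separately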

python/fastText/pybind/fasttext_pybind.cc

+29-9
@@ -63,9 +63,10 @@ PYBIND11_MODULE(fasttext_pybind, m) {
       .value("softmax", fasttext::loss_name::softmax)
       .export_values();

-  m.def("train", [](fasttext::FastText& ft, fasttext::Args& a) {
-    ft.train(a);
-  }, py::call_guard<py::gil_scoped_release>());
+  m.def(
+      "train",
+      [](fasttext::FastText& ft, fasttext::Args& a) { ft.train(a); },
+      py::call_guard<py::gil_scoped_release>());

   py::class_<fasttext::Vector>(m, "Vector", py::buffer_protocol())
       .def(py::init<ssize_t>())
@@ -120,17 +121,15 @@ PYBIND11_MODULE(fasttext_pybind, m) {
          [](fasttext::FastText& m,
             fasttext::Vector& v,
             const std::string text) {
-           std::stringstream ioss;
-           copy(text.begin(), text.end(), std::ostream_iterator<char>(ioss));
+           std::stringstream ioss(text);
            m.getSentenceVector(ioss, v);
          })
      .def(
          "tokenize",
          [](fasttext::FastText& m, const std::string text) {
            std::vector<std::string> text_split;
            std::shared_ptr<const fasttext::Dictionary> d = m.getDictionary();
-           std::stringstream ioss;
-           copy(text.begin(), text.end(), std::ostream_iterator<char>(ioss));
+           std::stringstream ioss(text);
            std::string token;
            while (!ioss.eof()) {
              while (d->readWord(ioss, token)) {
@@ -139,6 +138,28 @@ PYBIND11_MODULE(fasttext_pybind, m) {
              }
            }
            return text_split;
          })
+      .def(
+          "getLine",
+          [](fasttext::FastText& m, const std::string text) {
+            std::shared_ptr<const fasttext::Dictionary> d = m.getDictionary();
+            std::stringstream ioss(text);
+            std::string token;
+            std::vector<std::string> words;
+            std::vector<std::string> labels;
+            while (!ioss.eof()) {
+              while (d->readWord(ioss, token)) {
+                fasttext::entry_type type = d->getType(token);
+                if (type == fasttext::entry_type::word) {
+                  words.push_back(token);
+                } else {
+                  labels.push_back(token);
+                }
+              }
+            }
+            return std::
+                pair<std::vector<std::string>, std::vector<std::string>>(
+                    words, labels);
+          })
       .def(
           "getVocab",
           [](fasttext::FastText& m) {
@@ -199,8 +220,7 @@ PYBIND11_MODULE(fasttext_pybind, m) {
          // to exactly mimic the behavior of the cli
          [](fasttext::FastText& m, const std::string text, int32_t k) {
            std::vector<std::pair<fasttext::real, std::string>> predictions;
-           std::stringstream ioss;
-           copy(text.begin(), text.end(), std::ostream_iterator<char>(ioss));
+           std::stringstream ioss(text);
            m.predict(ioss, k, predictions);
            return predictions;
          })
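pybind11 converts the returned std::pair of string vectors into a Python tuple of two lists, which is what FastText.get_line hands through. A sketch, assuming a loaded model m:

    # The C++ std::pair<std::vector<std::string>, std::vector<std::string>>
    # surfaces in Python as (list_of_words, list_of_labels).
    words, labels = m.get_line("__label__x some text")
    assert isinstance(words, list) and isinstance(labels, list)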

python/fastText/tests/test_script.py

+12-2
@@ -227,8 +227,18 @@ def gen_test_subwords(self, kwargs):
     def gen_test_tokenize(self, kwargs):
         self.assertEqual(["asdf", "asdb"], fastText.tokenize("asdf asdb"))
         self.assertEqual(["asdf"], fastText.tokenize("asdf"))
-        self.assertEqual(["asdf", fastText.EOS], fastText.tokenize("asdf\n"))
-        self.assertEqual([fastText.EOS], fastText.tokenize("\n"))
+        gotError = False
+        try:
+            self.assertEqual([fastText.EOS], fastText.tokenize("\n"))
+        except ValueError:
+            gotError = True
+        self.assertTrue(gotError)
+        gotError = False
+        try:
+            self.assertEqual(["asdf", fastText.EOS], fastText.tokenize("asdf\n"))
+        except ValueError:
+            gotError = True
+        self.assertTrue(gotError)
         self.assertEqual([], fastText.tokenize(""))
         self.assertEqual([], fastText.tokenize(" "))
         # An empty string is not a token (it's just whitespace)
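Side note: the try/except/flag pattern above is what the commit ships; unittest's context-manager form would be an equivalent, more compact way to assert the same behavior:

    # Equivalent to the two guarded checks above (not what the commit uses):
    with self.assertRaises(ValueError):
        fastText.tokenize("\n")
    with self.assertRaises(ValueError):
        fastText.tokenize("asdf\n")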

src/dictionary.cc

+1-2
@@ -351,8 +351,7 @@ int32_t Dictionary::getLine(std::istream& in,

 int32_t Dictionary::getLine(std::istream& in,
                             std::vector<int32_t>& words,
-                            std::vector<int32_t>& labels,
-                            std::minstd_rand& rng) const {
+                            std::vector<int32_t>& labels) const {
   std::vector<int32_t> word_hashes;
   std::string token;
   int32_t ntokens = 0;

src/dictionary.h

+2-2
@@ -98,8 +98,8 @@ class Dictionary {
   void save(std::ostream&) const;
   void load(std::istream&);
   std::vector<int64_t> getCounts(entry_type) const;
-  int32_t getLine(std::istream&, std::vector<int32_t>&,
-                  std::vector<int32_t>&, std::minstd_rand&) const;
+  int32_t getLine(std::istream&, std::vector<int32_t>&, std::vector<int32_t>&)
+      const;
   int32_t getLine(std::istream&, std::vector<int32_t>&,
                   std::minstd_rand&) const;
   void threshold(int64_t, int64_t);

src/fasttext.cc

+4-4
@@ -366,7 +366,7 @@ void FastText::test(std::istream& in, int32_t k) {
   std::vector<int32_t> line, labels;

   while (in.peek() != EOF) {
-    dict_->getLine(in, line, labels, model_->rng);
+    dict_->getLine(in, line, labels);
     if (labels.size() > 0 && line.size() > 0) {
       std::vector<std::pair<real, int32_t>> modelPredictions;
       model_->predict(line, k, modelPredictions);
@@ -390,7 +390,7 @@ void FastText::predict(std::istream& in, int32_t k,
                        std::vector<std::pair<real,std::string>>& predictions) const {
   std::vector<int32_t> words, labels;
   predictions.clear();
-  dict_->getLine(in, words, labels, model_->rng);
+  dict_->getLine(in, words, labels);
   predictions.clear();
   if (words.empty()) return;
   Vector hidden(args_->dim);
@@ -430,7 +430,7 @@ void FastText::getSentenceVector(
   svec.zero();
   if (args_->model == model_name::sup) {
     std::vector<int32_t> line, labels;
-    dict_->getLine(in, line, labels, model_->rng);
+    dict_->getLine(in, line, labels);
     for (int32_t i = 0; i < line.size(); i++) {
       addInputVector(svec, line[i]);
     }
@@ -578,7 +578,7 @@ void FastText::trainThread(int32_t threadId) {
     real progress = real(tokenCount_) / (args_->epoch * ntokens);
     real lr = args_->lr * (1.0 - progress);
     if (args_->model == model_name::sup) {
-      localTokenCount += dict_->getLine(ifs, line, labels, model.rng);
+      localTokenCount += dict_->getLine(ifs, line, labels);
       supervised(model, lr, line, labels);
     } else if (args_->model == model_name::cbow) {
       localTokenCount += dict_->getLine(ifs, line, model.rng);
