Skip to content

Commit 2211639

Browse files
ajoulin authored and Facebook Github Bot committed
Add a -minCountLabel option
Summary: In order to reproduce the results from [2] on YFCC100M, we add an option to remove infrequent labels. Reviewed By: EdouardGrave Differential Revision: D4031684 fbshipit-source-id: c3724706dc0ae35e7d9cc6d08a52cdd4b9d4bccc
1 parent a88344f commit 2211639

File tree

6 files changed

+18
-6
lines changed

6 files changed

+18
-6
lines changed

README.md

+5
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,7 @@ The following arguments are optional:
133133
-ws size of the context window [5]
134134
-epoch number of epochs [5]
135135
-minCount minimal number of word occurrences [1]
136+
-minCountLabel minimal number of label occurrences [0]
136137
-neg number of negatives sampled [5]
137138
-wordNgrams max length of word ngram [1]
138139
-loss loss function {ns, hs, softmax} [ns]
@@ -180,6 +181,10 @@ Please cite [1](#enriching-word-vectors-with-subword-information) if using this
180181

181182
(\* These authors contributed equally.)
182183

184+
## Resources
185+
186+
You can find the preprocessed YFCC100M data used in [2] at https://research.facebook.com/research/fasttext/
187+
183188
## Join the fastText community
184189

185190
* Facebook page: https://www.facebook.com/groups/1174547215919768

src/args.cc

+4
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ Args::Args() {
2222
ws = 5;
2323
epoch = 5;
2424
minCount = 5;
25+
minCountLabel = 0;
2526
neg = 5;
2627
wordNgrams = 1;
2728
loss = loss_name::ns;
@@ -78,6 +79,8 @@ void Args::parseArgs(int argc, char** argv) {
7879
epoch = atoi(argv[ai + 1]);
7980
} else if (strcmp(argv[ai], "-minCount") == 0) {
8081
minCount = atoi(argv[ai + 1]);
82+
} else if (strcmp(argv[ai], "-minCountLabel") == 0) {
83+
minCountLabel = atoi(argv[ai + 1]);
8184
} else if (strcmp(argv[ai], "-neg") == 0) {
8285
neg = atoi(argv[ai + 1]);
8386
} else if (strcmp(argv[ai], "-wordNgrams") == 0) {
@@ -143,6 +146,7 @@ void Args::printHelp() {
143146
<< " -ws size of the context window [" << ws << "]\n"
144147
<< " -epoch number of epochs [" << epoch << "]\n"
145148
<< " -minCount minimal number of word occurences [" << minCount << "]\n"
149+
<< " -minCountLabel minimal number of label occurences [" << minCountLabel << "]\n"
146150
<< " -neg number of negatives sampled [" << neg << "]\n"
147151
<< " -wordNgrams max length of word ngram [" << wordNgrams << "]\n"
148152
<< " -loss loss function {ns, hs, softmax} [ns]\n"

src/args.h

+1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ class Args {
3131
int ws;
3232
int epoch;
3333
int minCount;
34+
int minCountLabel;
3435
int neg;
3536
int wordNgrams;
3637
loss_name loss;

src/dictionary.cc

+6-4
Original file line numberDiff line numberDiff line change
@@ -180,10 +180,11 @@ void Dictionary::readFromFile(std::istream& in) {
180180
std::cout << "\rRead " << ntokens_ / 1000000 << "M words" << std::flush;
181181
}
182182
if (size_ > 0.75 * MAX_VOCAB_SIZE) {
183-
threshold(minThreshold++);
183+
minThreshold++;
184+
threshold(minThreshold, minThreshold);
184185
}
185186
}
186-
threshold(args_->minCount);
187+
threshold(args_->minCount, args_->minCountLabel);
187188
initTableDiscard();
188189
initNgrams();
189190
if (args_->verbose > 0) {
@@ -197,13 +198,14 @@ void Dictionary::readFromFile(std::istream& in) {
197198
}
198199
}
199200

200-
void Dictionary::threshold(int64_t t) {
201+
void Dictionary::threshold(int64_t t, int64_t tl) {
201202
sort(words_.begin(), words_.end(), [](const entry& e1, const entry& e2) {
202203
if (e1.type != e2.type) return e1.type < e2.type;
203204
return e1.count > e2.count;
204205
});
205206
words_.erase(remove_if(words_.begin(), words_.end(), [&](const entry& e) {
206-
return e.type == entry_type::word && e.count < t;
207+
return (e.type == entry_type::word && e.count < t) ||
208+
(e.type == entry_type::label && e.count < tl);
207209
}), words_.end());
208210
words_.shrink_to_fit();
209211
size_ = 0;

src/dictionary.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ class Dictionary {
7777
void addNgrams(std::vector<int32_t>&, int32_t) const;
7878
int32_t getLine(std::istream&, std::vector<int32_t>&,
7979
std::vector<int32_t>&, std::minstd_rand&) const;
80-
void threshold(int64_t);
80+
void threshold(int64_t, int64_t);
8181
};
8282

8383
}

src/fasttext.cc

+1-1
Original file line numberDiff line numberDiff line change
@@ -309,7 +309,7 @@ void FastText::loadVectors(std::string filename) {
309309
}
310310
in.close();
311311

312-
dict_->threshold(1);
312+
dict_->threshold(1, 0);
313313
input_ = std::make_shared<Matrix>(dict_->nwords()+args_->bucket, args_->dim);
314314
input_->uniform(1.0 / args_->dim);
315315

0 commit comments

Comments
 (0)