Skip to content

Commit 7f4f816

Browse files
committed
initial checkin
0 parents  commit 7f4f816

13 files changed

+5203
-0
lines changed

.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
*.pyc
2+
*.so
3+

.gitmodules

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
[submodule "crfsuite"]
2+
path = crfsuite
3+
url = https://github.com/chokkan/crfsuite.git
4+
[submodule "liblbfgs"]
5+
path = liblbfgs
6+
url = https://github.com/chokkan/liblbfgs.git

README.rst

+61
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
==========
2+
PyCRFsuite
3+
==========
4+
5+
PyCRFsuite is a Python binding for CRFsuite_ that mimics scikit-learn_ to provide a similar API.
6+
7+
Why
8+
===
9+
Why make another Python binding for CRFsuite_ when CRFsuite already has its own SWIG Python package_? Some of the reasons are:
10+
11+
* The CRFsuite SWIG package is not easy to build. There are open issues reported against CRFsuite about this, e.g. issue6_ and issue19_.
12+
* A model trained with CRFsuite's SWIG Python package_ cannot be dumped (with `crfsuite dump`).
13+
* Wrapping a library with Cython is fun, and the result is easy to extend.
14+
15+
Usage
16+
=====
17+
18+
The easiest way to start is to fetch a dataset in CoNLL 2000 format and define a feature extraction function, e.g.::
19+
20+
def features(words, i):
21+
word = words[i]
22+
23+
yield "word:{}".format(word.lower())
24+
25+
if word[0].isupper():
26+
yield "CAP"
27+
28+
if i > 0:
29+
yield "word-1:{}".format(words[i - 1].lower())
30+
if i + 1 < len(words):
31+
yield "word+1:{}".format(words[i + 1].lower())
32+
33+
Load the training file, say train.txt::
34+
35+
X = []
36+
y = []
37+
38+
for xseq, yseq in load_conll('train.txt', features):
39+
X.append(xseq)
40+
y.append(yseq)
41+
42+
Train a model::
43+
44+
from pycrfsuite import CRFSuite
45+
46+
clf = CRFSuite('model_name')
47+
clf.fit(X, y)
48+
49+
Authors
50+
=======
51+
Terry Peng <pengtaoo@gmail.com>
52+
53+
License
54+
=======
55+
Licensed under MIT license.
56+
57+
.. _CRFsuite: https://github.com/chokkan/crfsuite
58+
.. _package: https://github.com/chokkan/crfsuite/swig/python
59+
.. _scikit-learn: http://scikit-learn.org/
60+
.. _issue6: https://github.com/chokkan/crfsuite/issues/6
61+
.. _issue19: https://github.com/chokkan/crfsuite/issues/19

datasets/__init__.py

Whitespace-only changes.

datasets/conll.py

+44
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
"""
2+
Load a CoNLL dataset and generate features
3+
"""
4+
from itertools import imap, groupby
5+
from contextlib import closing
6+
7+
def _open(f):
8+
return closing(open(f) if isinstance(f, basestring) else f)
9+
10+
def load_conll(fname, features):
    """Load a CoNLL-style file and extract features sentence by sentence.

    The input holds one token per line; the last whitespace-separated
    column of a line is the label and everything before it is the
    observation.  Blank lines separate sentences.

    ``fname`` is a path or an open file-like object (see ``_open``).
    ``features`` is called as ``features(observations, i)`` for every
    position ``i`` of a sentence and must return an iterable of feature
    names.

    This is a generator.  For each sentence it yields ``(xseq, yseq)``:
    ``xseq`` is a list with one ``{feature_name: 1}`` dict per token and
    ``yseq`` is the list of the corresponding labels.

    Raises ``ValueError`` if a non-blank line has no separate label column.
    """
    with _open(fname) as f:
        # Call the method on each line instead of mapping ``str.strip``:
        # the unbound ``str.strip`` rejects unicode lines on Python 2.
        lines = (ln.strip() for ln in f)
        # Grouping on truthiness splits the stream into alternating runs of
        # blank and non-blank lines; each non-blank run is one sentence.
        sentences = (list(grp) for nonempty, grp in groupby(lines, bool)
                     if nonempty)

        for sentence in sentences:
            rows = [ln.rsplit(None, 1) for ln in sentence]
            for ln, row in zip(sentence, rows):
                if len(row) != 2:
                    raise ValueError(
                        "expected '<observation> <label>' but got %r" % (ln,))

            obs, lbl = zip(*rows)
            xseq = [dict.fromkeys(features(obs, i), 1)
                    for i in range(len(obs))]
            yield xseq, list(lbl)
32+
33+
def features(words, i):
    """Yield the feature names describing ``words[i]`` and its neighbours.

    ``words`` is the sequence of tokens of one sentence and ``i`` the
    position to describe.  The generated features are, in order:

    * ``word:<lowercased token>``
    * ``CAP`` when the token starts with an upper-case character
    * ``word-1:<lowercased previous token>`` when there is a previous token
    * ``word+1:<lowercased next token>`` when there is a next token
    """
    word = words[i]

    yield "word:{}".format(word.lower())

    # Slice instead of indexing so that an empty token simply does not get
    # the CAP feature rather than raising IndexError.
    if word[:1].isupper():
        yield "CAP"

    if i > 0:
        yield "word-1:{}".format(words[i - 1].lower())
    if i + 1 < len(words):
        yield "word+1:{}".format(words[i + 1].lower())

example.py

+65
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
"""
2+
Demonstrates how to use pycrfsuite for CoNLL tasks.
3+
4+
python example.py train modelname
5+
6+
"""
7+
from datasets.conll import load_conll, features
8+
from pycrfsuite import CRFSuite
9+
from sklearn.cross_validation import train_test_split
10+
from sklearn.metrics import classification_report
11+
12+
import sys
13+
from itertools import chain
14+
15+
def avg_bio_f1_score(y_true, y_pred):
    """
    Mean of the per-sequence BIO F-scores of ``y_true`` versus ``y_pred``.

    Both arguments are lists of BIO-encoded label sequences.  Each pair of
    sequences is scored with ``seqlearn.evaluation.bio_f_score`` and the
    scores are averaged over the number of sequences in ``y_true``.

    Requires https://github.com/larsmans/seqlearn; it is imported lazily
    so the rest of the module works without it.
    """
    from seqlearn.evaluation import bio_f_score

    per_sequence = map(bio_f_score, y_true, y_pred)
    total = sum(per_sequence)
    return total / len(y_true)
28+
29+
30+
def bio_classification_report(y_true, y_pred):
    """
    Token-level classification report for lists of BIO-encoded sequences.

    The sequences are flattened into one list of tokens each, and the
    report is restricted to the labels seen in either list except "O".
    Labels are ordered by the part after the first "-" and then by the
    part before it.
    """
    flat_true = list(chain.from_iterable(y_true))
    flat_pred = list(chain.from_iterable(y_pred))

    def entity_then_prefix(tag):
        # "B-PER" -> ["PER", "B"]
        return tag.split('-', 1)[::-1]

    tags = set(flat_true).union(flat_pred)
    tags.discard('O')

    return classification_report(
        flat_true,
        flat_pred,
        labels=sorted(tags, key=entity_then_prefix),
    )
43+
44+
if __name__ == '__main__':
    # Command-line driver: train on 60% of the sequences in the given
    # file, then print a token-level report for the remaining 40%.

    print(__doc__)

    if len(sys.argv) < 3:
        print("Usage: {0} training_file modelname".format(sys.argv[0]))
        sys.exit(1)

    training_file, modelname = sys.argv[1], sys.argv[2]
    clf = CRFSuite(modelname)

    sequences = list(load_conll(training_file, features))
    X = [xseq for xseq, _ in sequences]
    y = [yseq for _, yseq in sequences]

    # Fixed random_state keeps the train/test split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=0)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    print(bio_classification_report(y_test, y_pred))

pycrfsuite/__init__.py

+42
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
"""
2+
CRFsuite scikit-learn API.
3+
"""
4+
import itertools
5+
from sklearn.base import BaseEstimator
6+
from ._crfsuite import PyTrainer, PyTagger, PyAttribute
7+
8+
class CRFSuite(BaseEstimator):
    """scikit-learn style estimator that trains and applies a CRFsuite model.

    The model is persisted in the file ``model_filename``: ``fit`` writes
    it and ``predict`` opens it again for tagging.
    """

    def __init__(self, model_filename):
        self.trainer = PyTrainer()
        self.model_filename = model_filename

    def fit(self, X, y):
        """Train on the sequences ``X`` with label sequences ``y``.

        ``X`` is an iterable of sequences, each item being a
        ``{feature_name: value}`` dict; ``y`` holds the matching sequences
        of labels.  Every pair is appended to the trainer held by this
        instance before training starts, and the trained model is written
        to ``self.model_filename``.

        Returns ``self`` so calls can be chained, as is conventional for
        scikit-learn estimators.
        """
        for items, labels in itertools.izip(X, y):
            xseq = self._to_xseq(items)
            yseq = self._to_yseq(labels)
            # NOTE(review): the third argument is presumably the CRFsuite
            # group number of the instance -- confirm against PyTrainer.
            self.trainer.append(xseq, yseq, 0)

        self.trainer.select('l2sgd', 'crf1d')
        self.trainer.set('c2', '0.1')

        # NOTE(review): -1 presumably means "no holdout group" -- confirm
        # against PyTrainer.train.
        self.trainer.train(self.model_filename, -1)
        return self

    def predict(self, X):
        """Return one predicted label sequence per sequence in ``X``.

        A fresh tagger is opened on ``self.model_filename`` for each call,
        so the model file must already exist (e.g. written by ``fit``).
        """
        tagger = PyTagger()
        tagger.open(self.model_filename)

        xseqs = [self._to_xseq(items) for items in X]
        yseqs = []

        for xseq in xseqs:
            tagger.set(xseq)
            yseqs.append(tagger.viterbi())

        return yseqs

    def _to_xseq(self, items):
        """Convert a sequence of feature dicts to lists of ``PyAttribute``."""
        return [[PyAttribute(k, v) for k, v in item.iteritems()] for item in items]

    def _to_yseq(self, labels):
        """Encode every label as UTF-8 for the extension module."""
        return [label.encode('utf8') for label in labels]

pycrfsuite/crfsuite_api.pxd

+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
from libcpp.string cimport string
2+
from libcpp.vector cimport vector
3+
4+
# Cython declarations for the C++ API declared in crfsuite_api.hpp.
#
# Trainer and Tagger methods are declared ``except +`` so that a C++
# exception thrown inside CRFsuite is translated into a Python exception
# instead of propagating through Cython-generated code and aborting the
# process.
cdef extern from "../crfsuite/include/crfsuite_api.hpp" namespace "CRFSuite":
    cdef cppclass Attribute:
        string attr
        double value

        Attribute()
        Attribute(string)
        Attribute(string, double)

    ctypedef vector[Attribute] Item
    ctypedef vector[Item] ItemSequence
    ctypedef vector[string] StringList

    cdef cppclass Trainer:
        Trainer() except +
        void clear() except +
        void append(ItemSequence, vector[string], int) except +
        int select(string, string) except +
        int train(string, int) except +
        vector[string] params() except +
        void set(string, string) except +
        string get(string) except +
        string help(string) except +
        # message(String) ?

    cdef cppclass Tagger:
        Tagger() except +
        int open(string) except +
        void close() except +
        vector[string] labels() except +
        vector[string] tag(ItemSequence) except +
        void set(ItemSequence) except +
        vector[string] viterbi() except +
        double probability(vector[string]) except +
        double marginal(string, int) except +

0 commit comments

Comments
 (0)