scrapinghub
diff --git a/‎.gitignore
+3 b/‎.gitignore
+3
diff --git a/‎.gitmodules
+6 b/‎.gitmodules
+6
diff --git a/‎README.rst
+61 b/‎README.rst
+61
diff --git a/‎datasets/__init__.py b/‎datasets/__init__.py
diff --git a/‎datasets/conll.py
+44 b/‎datasets/conll.py
+44
diff --git a/‎example.py
+65 b/‎example.py
+65
diff --git a/‎pycrfsuite/__init__.py
+42 b/‎pycrfsuite/__init__.py
+42
diff --git a/‎pycrfsuite/crfsuite_api.pxd
+38 b/‎pycrfsuite/crfsuite_api.pxd
+38
@@ -0,0 +1,3 @@
+*.pyc
+*.so
+
@@ -0,0 +1,6 @@
+[submodule "crfsuite"]
+	path = crfsuite
+	url = https://github.com/chokkan/crfsuite.git
+[submodule "liblbfgs"]
+	path = liblbfgs
+	url = https://github.com/chokkan/liblbfgs.git
@@ -0,0 +1,61 @@
+==========
+PyCRFsuite
+==========
+
+PyCRFsuite is a Python binding for CRFsuite_ that mimics scikit-learn_ to provide a similar API.
+
+Why
+===
+Why make another Python binding for CRFsuite_ even though crfsuite has its own SWIG Python package_? Some of the reasons are:
+
+* The crfsuite SWIG package is not easy to build. There are some open issues reported on crfsuite, e.g. issue6_ and issue19_.
+* Models trained with crfsuite's SWIG Python package_ can't be dumped (with ``crfsuite dump``).
+* wrapping a library with cython is fun and easy to extend.
+
+Usage
+=====
+
+The easiest way to start is to fetch a dataset in CoNLL 2000 format, define a feature extraction function, e.g.::
+
+    def features(words, i):
+        word = words[i]
+
+        yield "word:{}".format(word.lower())
+
+        if word[0].isupper():
+            yield "CAP"
+
+        if i > 0:
+            yield "word-1:{}".format(words[i - 1].lower())
+        if i + 1 < len(words):
+            yield "word+1:{}".format(words[i + 1].lower())
+
+Load the training file, say train.txt::
+
+    X = []
+    y = []
+
+    for xseq, yseq in load_conll('train.txt', features):
+        X.append(xseq)
+        y.append(yseq)
+
+Train a model::
+
+    from pycrfsuite import CRFSuite
+
+    clf = CRFSuite('model_name')
+    clf.fit(X, y)
+
+Authors
+=======
+Terry Peng <pengtaoo@gmail.com>
+
+License
+=======
+Licensed under MIT license.
+
+.. _CRFsuite: https://github.com/chokkan/crfsuite
+.. _package: https://github.com/chokkan/crfsuite/tree/master/swig/python
+.. _scikit-learn: http://scikit-learn.org/
+.. _issue6: https://github.com/chokkan/crfsuite/issues/6
+.. _issue19: https://github.com/chokkan/crfsuite/issues/19
@@ -0,0 +1,44 @@
+"""
+Load a CoNLL dataset and generate features
+"""
+from itertools import imap, groupby
+from contextlib import closing
+
+def _open(f):
+    """Wrap ``f`` in a ``closing`` context manager.
+
+    ``f`` may be a path (any ``basestring``), which is opened with the
+    default mode, or any other object, which is used as-is.  Either way
+    the wrapped object's ``close()`` is called when the ``with`` block
+    exits.
+    """
+    if isinstance(f, basestring):
+        f = open(f)
+    return closing(f)
+
+def load_conll(fname, features):
+    """Read a CoNLL-style file and yield one ``(xseq, yseq)`` pair per sentence.
+
+    ``fname`` is a path or an already-open file-like object.  Every line is
+    stripped, and each run of consecutive non-empty lines forms one
+    sentence.  A line is split on its last run of whitespace: the left part
+    is the observation, the right part is the label.
+
+    ``features`` is called as ``features(observations, i)`` for each
+    position ``i`` of a sentence and must return an iterable of feature
+    names; every name becomes a key with value 1 in that position's dict.
+
+    Yields a list of feature dicts together with the list of matching labels.
+    """
+
+    with _open(fname) as f:
+        stripped = imap(str.strip, f)
+
+        # groupby(..., bool) alternates between runs of blank and non-blank
+        # lines; only the non-blank runs are sentences.
+        for has_text, sentence in groupby(stripped, bool):
+            if not has_text:
+                continue
+
+            pairs = [ln.rsplit(None, 1) for ln in sentence]
+            obs, lbl = zip(*pairs)
+
+            xseq = [dict.fromkeys(features(obs, i), 1)
+                    for i in xrange(len(obs))]
+            yseq = list(lbl)
+
+            if xseq and yseq:
+                yield xseq, yseq
+
+def features(words, i):
+    """Yield feature names for the token at position ``i`` of ``words``.
+
+    Produces, in order: the lower-cased token, ``"CAP"`` when the token
+    starts with an upper-case character, the lower-cased previous token
+    (when there is one) and the lower-cased next token (when there is one).
+    """
+    token = words[i]
+
+    yield "word:{}".format(token.lower())
+
+    starts_upper = token[0].isupper()
+    if starts_upper:
+        yield "CAP"
+
+    # Context features are only emitted when the neighbour exists.
+    if 0 < i:
+        previous = words[i - 1]
+        yield "word-1:{}".format(previous.lower())
+    if len(words) > i + 1:
+        following = words[i + 1]
+        yield "word+1:{}".format(following.lower())
@@ -0,0 +1,65 @@
+"""
+Demonstrates how to use pycrfsuite for CoNLL tasks.
+
+python example.py train modelname
+
+"""
+from datasets.conll import load_conll, features
+from pycrfsuite import CRFSuite
+from sklearn.cross_validation import train_test_split
+from sklearn.metrics import classification_report
+
+import sys
+from itertools import chain
+
+def avg_bio_f1_score(y_true, y_pred):
+    """
+    Average the per-sequence BIO F-score over lists of BIO-encoded
+    sequences ``y_true`` and ``y_pred`` (macro-averaged F1).
+
+    Each pair of sequences is scored with ``seqlearn``'s ``bio_f_score``,
+    under which a predicted named entity counts as correct only if it
+    exactly matches the corresponding entity in ``y_true``.  The scores are
+    summed and divided by the number of sequences in ``y_true``.
+
+    It requires https://github.com/larsmans/seqlearn to work.
+    """
+    from seqlearn.evaluation import bio_f_score
+
+    per_sequence = map(bio_f_score, y_true, y_pred)
+    total = sum(per_sequence)
+    return total / len(y_true)
+
+
+def bio_classification_report(y_true, y_pred):
+    """
+    Build a token-level classification report for BIO-encoded sequences.
+
+    ``y_true`` and ``y_pred`` are lists of label sequences.  They are
+    flattened into two token lists, and the report covers every label seen
+    in either list except "O".
+    """
+    def by_entity_then_prefix(tag):
+        # "B-PER" -> ["PER", "B"], so labels sort by the part after the
+        # first '-' and then by the part before it.
+        return tag.split('-', 1)[::-1]
+
+    flat_true = list(chain.from_iterable(y_true))
+    flat_pred = list(chain.from_iterable(y_pred))
+
+    tags = set(flat_true)
+    tags.update(flat_pred)
+    tags.discard('O')
+
+    ordered_tags = sorted(tags, key=by_entity_then_prefix)
+    return classification_report(flat_true, flat_pred, labels=ordered_tags)
+
+if __name__ == '__main__':
+
+    # Show the module docstring as a short description of the script.
+    print __doc__
+
+    # Both a training file and a model name are required.
+    if len(sys.argv) < 3:
+        print "Usage: {0} training_file modelname".format(sys.argv[0])
+        sys.exit(1)
+
+    # The second argument is handed to CRFSuite as the model file name.
+    clf = CRFSuite(sys.argv[2])
+
+    X = []
+    y = []
+
+    # Collect every (feature sequence, label sequence) pair from the file.
+    for xseq, yseq in load_conll(sys.argv[1], features):
+        X.append(xseq)
+        y.append(yseq)
+
+    # Hold out 40% of the sequences for evaluation; the fixed random_state
+    # keeps the split repeatable between runs.
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
+    clf.fit(X_train, y_train)
+
+    # Tag the held-out sequences and print token-level metrics.
+    y_pred = clf.predict(X_test)
+    print bio_classification_report(y_test, y_pred)
@@ -0,0 +1,42 @@
+"""
+CRFsuite scikit-learn API.
+"""
+import itertools
+from sklearn.base import BaseEstimator
+from ._crfsuite import PyTrainer, PyTagger, PyAttribute
+
+class CRFSuite(BaseEstimator):
+    """scikit-learn style estimator that trains and applies a CRFsuite model.
+
+    The model lives in the file ``model_filename``: ``fit`` passes that name
+    to the trainer and ``predict`` opens the same file with a tagger.
+    """
+
+    def __init__(self, model_filename):
+        self.trainer = PyTrainer()
+        self.model_filename = model_filename
+
+    def fit(self, X, y):
+        """Train on sequences ``X`` with label sequences ``y``.
+
+        ``X`` is an iterable of sequences, each a list of
+        ``{feature: value}`` dicts; ``y`` is an iterable of the matching
+        label sequences.
+
+        Returns ``self`` so that calls can be chained
+        (``clf.fit(X, y).predict(X)``), as scikit-learn estimators do.
+        """
+        for items, labels in itertools.izip(X, y):
+            xseq = self._to_xseq(items)
+            yseq = self._to_yseq(labels)
+            # NOTE(review): the trailing 0 is presumably the CRFsuite group
+            # number for this instance -- confirm against crfsuite_api.hpp.
+            self.trainer.append(xseq, yseq, 0)
+
+        self.trainer.select('l2sgd', 'crf1d')
+        self.trainer.set('c2', '0.1')
+
+        # NOTE(review): -1 presumably means "no holdout group" -- confirm
+        # against crfsuite_api.hpp.
+        self.trainer.train(self.model_filename, -1)
+        return self
+
+    def predict(self, X):
+        """Return one predicted label sequence per sequence in ``X``.
+
+        Opens the model file named by ``model_filename`` and decodes each
+        sequence with the tagger's ``viterbi()``.
+        """
+        tagger = PyTagger()
+        tagger.open(self.model_filename)
+
+        xseqs = [self._to_xseq(items) for items in X]
+        yseqs = []
+
+        for xseq in xseqs:
+            tagger.set(xseq)
+            yseqs.append(tagger.viterbi())
+
+        return yseqs
+
+    def _to_xseq(self, items):
+        """Convert a list of ``{feature: value}`` dicts to lists of PyAttribute."""
+        return [[PyAttribute(k, v) for k, v in item.iteritems()] for item in items]
+
+    def _to_yseq(self, labels):
+        """Return ``labels`` as a list of UTF-8 byte strings.
+
+        Labels that are already byte strings are passed through untouched:
+        on Python 2, calling ``.encode('utf8')`` on a byte string first
+        decodes it as ASCII and fails for any non-ASCII label.
+        """
+        return [label if isinstance(label, bytes) else label.encode('utf8')
+                for label in labels]
@@ -0,0 +1,38 @@
+from libcpp.string cimport string
+from libcpp.vector cimport vector
+
+# Cython declarations of the C++ types found in the ``CRFSuite`` namespace
+# of crfsuite_api.hpp, so that Cython code can construct and call them.
+cdef extern from "../crfsuite/include/crfsuite_api.hpp" namespace "CRFSuite":
+    # An ``attr`` string paired with a double ``value``.
+    cdef cppclass Attribute:
+        string attr
+        double value
+
+        Attribute()
+        Attribute(string)
+        Attribute(string, double)
+
+    # An Item is a vector of Attributes; an ItemSequence is a vector of Items.
+    ctypedef vector[Attribute] Item
+    ctypedef vector[Item] ItemSequence
+    ctypedef vector[string] StringList
+
+    cdef cppclass Trainer:
+        # ``except +``: a C++ exception thrown by the constructor is
+        # converted into a Python exception.
+        # NOTE(review): the methods below are declared without ``except +``;
+        # confirm that they cannot throw.
+        Trainer() except +
+        void clear()
+        void append(ItemSequence, vector[string], int)
+        int select(string, string)
+        int train(string, int)
+        vector[string] params()
+        void set(string, string)
+        string get(string)
+        string help(string)
+        # message(String) ?
+
+    cdef cppclass Tagger:
+        # ``except +``: a C++ exception thrown by the constructor is
+        # converted into a Python exception.
+        Tagger() except +
+        int open(string)
+        void close()
+        vector[string] labels()
+        vector[string] tag(ItemSequence)
+        void set(ItemSequence)
+        vector[string] viterbi()
+        double probability(vector[string])
+        double marginal(string, int)