Skip to content

Commit 4014eb0

Browse files
Elijah Rippeth, fgregg
Elijah Rippeth
authored and committed
make default string encoding utf-8 to handle non-ascii X's and y's
1 parent 75c3081 commit 4014eb0

File tree

3 files changed

+35
-19
lines changed

3 files changed

+35
-19
lines changed

pycrfsuite/_pycrfsuite.pyx

+1-1
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
# cython: embedsignature=True
22
# cython: c_string_type=str
3-
# cython: c_string_encoding=ascii
3+
# cython: c_string_encoding=utf-8
44
# cython: profile=False
55
# distutils: language=c++
66
from . cimport crfsuite_api

tests/conftest.py

+16-5
Original file line number | Diff line number | Diff line change
@@ -12,16 +12,27 @@ def xseq():
1212
{"clean": 1, "shop": 0.1},
1313
{"walk": 1, "shop": 0.5},
1414
{},
15-
{'clean': 1},
16-
{u'солнце': u'не светит'.encode('utf8'), 'clean': 1},
17-
{'world': 2}
15+
{"clean": 1},
16+
{"солнце": "не светит".encode(), "clean": 1},
17+
{"world": 2},
1818
]
1919

2020

2121
@pytest.fixture
2222
def yseq():
23-
return ['sunny', 'sunny', u'sunny', 'rainy', 'rainy', 'rainy',
24-
'sunny', 'sunny', 'rainy', 'rainy', '好']
23+
return [
24+
"sunny",
25+
"sunny",
26+
"sunny",
27+
"rainy",
28+
"rainy",
29+
"rainy",
30+
"sunny",
31+
"sunny",
32+
"rainy",
33+
"rainy",
34+
"好",
35+
]
2536

2637

2738
@pytest.fixture

tests/test_tagger.py

+18-13
Original file line number | Diff line number | Diff line change
@@ -171,12 +171,17 @@ def test_info(model_filename):
171171
with Tagger().open(model_filename) as tagger:
172172
res = tagger.info()
173173

174-
assert res.transitions[('sunny', 'sunny')] > res.transitions[('sunny', 'rainy')]
175-
assert res.state_features[('walk', 'sunny')] > res.state_features[('walk', 'rainy')]
176-
assert (u'солнце:не светит', u'rainy') in res.state_features
177-
assert res.header['num_labels'] == '3'
178-
assert set(res.labels.keys()) == set(['sunny', 'rainy', '好'])
179-
assert set(res.attributes.keys()) == set(['shop', 'walk', 'clean', u'солнце:не светит', 'world'])
174+
assert res.transitions[("sunny", "sunny")] > res.transitions[("sunny", "rainy")]
175+
assert (
176+
res.state_features[("walk", "sunny")]
177+
> res.state_features[("walk", "rainy")]
178+
)
179+
assert ("солнце:не светит", "rainy") in res.state_features
180+
assert res.header["num_labels"] == "3"
181+
assert set(res.labels.keys()) == set(["sunny", "rainy", "好"])
182+
assert set(res.attributes.keys()) == set(
183+
["shop", "walk", "clean", "солнце:не светит", "world"]
184+
)
180185

181186
# it shouldn't segfault on a closed tagger
182187
with pytest.raises(RuntimeError):
@@ -222,13 +227,13 @@ def test_append_nested_dicts(tmpdir):
222227
with Tagger().open(model_filename) as tagger:
223228
info = tagger.info()
224229
assert set(info.attributes.keys()) == {
225-
"foo:bar:baz",
226-
"foo:spam",
227-
"foo:egg:x",
228-
"foo:egg:y",
229-
"foo:ham:x",
230-
"foo:ham:y",
231-
"foo:bar:ham",
230+
"foo:bar:baz",
231+
"foo:spam",
232+
"foo:egg:x",
233+
"foo:egg:y",
234+
"foo:ham:x",
235+
"foo:ham:y",
236+
"foo:bar:ham",
232237
}
233238

234239
for feat in ["foo:bar:baz", "foo:spam", "foo:egg:x", "foo:egg:y"]:

0 commit comments

Comments
 (0)