@@ -65,7 +65,12 @@ def encode_ordinary(self, text: str) -> list[int]:
65
65
>>> enc.encode_ordinary("hello world")
66
66
[31373, 995]
67
67
"""
68
- return self ._core_bpe .encode_ordinary (text )
68
+ try :
69
+ return self ._core_bpe .encode_ordinary (text )
70
+ except UnicodeEncodeError :
71
+ # See comment in encode
72
+ text = text .encode ("utf-16" , "surrogatepass" ).decode ("utf-16" , "replace" )
73
+ return self ._core_bpe .encode_ordinary (text )
69
74
70
75
def encode (
71
76
self ,
@@ -111,7 +116,17 @@ def encode(
111
116
if match := _special_token_regex (disallowed_special ).search (text ):
112
117
raise_disallowed_special_token (match .group ())
113
118
114
- return self ._core_bpe .encode (text , allowed_special )
119
+ try :
120
+ return self ._core_bpe .encode (text , allowed_special )
121
+ except UnicodeEncodeError :
122
+ # BPE operates on bytes, but the regex operates on unicode. If we pass a str that is
123
+ # invalid UTF-8 to Rust, it will rightfully complain. Here we do a quick and dirty
124
+ # fixup for any surrogate pairs that may have sneaked their way into the text.
125
+ # Technically, this introduces a place where encode + decode doesn't roundtrip a Python
126
+ # string, but given that this is input we want to support, maybe that's okay.
127
+ # Also we use errors="replace" to handle weird things like lone surrogates.
128
+ text = text .encode ("utf-16" , "surrogatepass" ).decode ("utf-16" , "replace" )
129
+ return self ._core_bpe .encode (text , allowed_special )
115
130
116
131
def encode_ordinary_batch (self , text : list [str ], * , num_threads : int = 8 ) -> list [list [int ]]:
117
132
"""Encodes a list of strings into tokens, in parallel, ignoring special tokens.
0 commit comments