Skip to content

Commit e1c661e

Browse files
committed
Bump version, sync codebase
1 parent 82facf9 commit e1c661e

File tree

4 files changed

+23
-4
lines changed

4 files changed

+23
-4
lines changed

CHANGELOG.md

+4
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22

33
This is the changelog for the open source version of tiktoken.
44

5+
## [v0.3.3]
6+
- `tiktoken` will now make a best-effort attempt to replace surrogate pairs with the corresponding
7+
Unicode character and will replace lone surrogates with the Unicode replacement character.
8+
59
## [v0.3.2]
610
- Add encoding for GPT-4
711

Cargo.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "tiktoken"
3-
version = "0.3.2"
3+
version = "0.3.3"
44
edition = "2021"
55
rust-version = "1.57.0"
66

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "tiktoken"
3-
version = "0.3.2"
3+
version = "0.3.3"
44
description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
55
readme = "README.md"
66
license = {file = "LICENSE"}

tiktoken/core.py

+17-2
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,12 @@ def encode_ordinary(self, text: str) -> list[int]:
6565
>>> enc.encode_ordinary("hello world")
6666
[31373, 995]
6767
"""
68-
return self._core_bpe.encode_ordinary(text)
68+
try:
69+
return self._core_bpe.encode_ordinary(text)
70+
except UnicodeEncodeError:
71+
# See comment in encode
72+
text = text.encode("utf-16", "surrogatepass").decode("utf-16", "replace")
73+
return self._core_bpe.encode_ordinary(text)
6974

7075
def encode(
7176
self,
@@ -111,7 +116,17 @@ def encode(
111116
if match := _special_token_regex(disallowed_special).search(text):
112117
raise_disallowed_special_token(match.group())
113118

114-
return self._core_bpe.encode(text, allowed_special)
119+
try:
120+
return self._core_bpe.encode(text, allowed_special)
121+
except UnicodeEncodeError:
122+
# BPE operates on bytes, but the regex operates on unicode. If we pass a str that is
123+
# invalid UTF-8 to Rust, it will rightfully complain. Here we do a quick and dirty
124+
# fixup for any surrogate pairs that may have sneaked their way into the text.
125+
# Technically, this introduces a place where encode + decode doesn't roundtrip a Python
126+
# string, but given that this is input we want to support, maybe that's okay.
127+
# Also we use errors="replace" to handle weird things like lone surrogates.
128+
text = text.encode("utf-16", "surrogatepass").decode("utf-16", "replace")
129+
return self._core_bpe.encode(text, allowed_special)
115130

116131
def encode_ordinary_batch(self, text: list[str], *, num_threads: int = 8) -> list[list[int]]:
117132
"""Encodes a list of strings into tokens, in parallel, ignoring special tokens.

0 commit comments

Comments
 (0)