Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Compile html5lib with Cython #524

Draft
wants to merge 22 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
c52e731
Get rid of getPhases
gsnedders Jun 23, 2020
8cff6aa
fixup! Get rid of getPhases
gsnedders Jun 23, 2020
6eb4d2d
Move tests
gsnedders Oct 18, 2020
d2474af
Make InputStream.readChunk default an int
gsnedders Jan 4, 2021
0904df3
Remove last trace of Tokenizer.lastFourChars
gsnedders Jan 4, 2021
8ebff2e
Move Tokenizer.state to Tokenizer._state
gsnedders Jan 4, 2021
4a8e28a
Instead of comparing with a set of ints, use maths
gsnedders Jan 4, 2021
2ae13cc
Remove unused Tokenizer.escape/escapeFlag
gsnedders Jan 4, 2021
c22d069
Avoid needless setter write, mutate value directly
gsnedders Jan 5, 2021
81b3aaf
Reduce list/tuple access
gsnedders Jan 5, 2021
47df02b
Move lowercasing to _ascii module
gsnedders Jan 5, 2021
7d7a079
Always initialize Parser.tokenizer
gsnedders Jan 5, 2021
1acb5dd
Remove long unused Parser.lastPhase/Parser.beforeRCDataPhase
gsnedders Jan 5, 2021
b6a6484
Speed-up Parser.mainLoop a bit
gsnedders Jan 5, 2021
4822712
Get rid of more frozenset calls around constants
gsnedders Jan 5, 2021
f06451e
Add assert for leavingThisState
gsnedders Oct 30, 2020
9e9ff5f
Avoid recursion in etree.testSerializer
gsnedders Oct 27, 2020
2036738
Get rid of remaining non-decorator property()
gsnedders Oct 27, 2020
2c8e0ec
Call super().f() rather than Base.f(self)
gsnedders Jan 5, 2021
84cbc20
Move _getEtreeTag out of the class
gsnedders Oct 29, 2020
8b89668
Change attributes to be created as dicts from day one
gsnedders Oct 29, 2020
e65c433
Start of Cythonizing the tokenizer
gsnedders Jan 4, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
[submodule "testdata"]
path = html5lib/tests/testdata
path = tests/testdata
url = https://github.com/html5lib/html5lib-tests.git
2,642 changes: 1,321 additions & 1,321 deletions .pytest.expect

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@ include .pytest.expect
include tox.ini
include pytest.ini
graft html5lib/tests/testdata
recursive-include html5lib *.pxd
recursive-include html5lib/tests *.py
10 changes: 5 additions & 5 deletions benchmarks/bench_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import pyperf

sys.path[0:0] = [os.path.join(os.path.dirname(__file__), "..")]
#sys.path[0:0] = [os.path.join(os.path.dirname(__file__), "..")]
import html5lib # noqa: E402


Expand Down Expand Up @@ -49,9 +49,9 @@ def add_cmdline_args(cmd, args):
source = io.BytesIO(fh.read())

if "parse" in benchmarks:
for tb in ("etree", "dom", "lxml"):
for tb in ("etree",):
runner.bench_func("html_parse_%s" % tb, bench_parse, source, tb)

if "serialize" in benchmarks:
for tb in ("etree", "dom", "lxml"):
runner.bench_time_func("html_serialize_%s" % tb, bench_serialize, source, tb)
# if "serialize" in benchmarks:
# for tb in ("etree",):
# runner.bench_time_func("html_serialize_%s" % tb, bench_serialize, source, tb)
2 changes: 1 addition & 1 deletion benchmarks/bench_wpt.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import pyperf

sys.path[0:0] = [os.path.join(os.path.dirname(__file__), "..")]
#sys.path[0:0] = [os.path.join(os.path.dirname(__file__), "..")]
import html5lib # noqa: E402


Expand Down
5 changes: 5 additions & 0 deletions html5lib/_ascii.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from .constants import asciiUpper2Lower


def ascii_lower(s):
    """Return *s* translated through the ``asciiUpper2Lower`` table.

    The table is imported from ``.constants``; going by its name it maps
    ASCII uppercase letters to their lowercase forms, leaving every other
    character untouched (confirm against the table's definition).

    :arg s: object exposing ``translate`` (normally a text string)

    :returns: the result of ``s.translate(asciiUpper2Lower)``
    """
    lowered = s.translate(asciiUpper2Lower)
    return lowered
109 changes: 109 additions & 0 deletions html5lib/_inputstream.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# cython: language_level=3
cimport cython
from cpython cimport array

# Function-pointer type: takes an HTMLUnicodeInputStream and a unicode
# string and returns nothing. "except *" means callers always check for a
# raised Python exception after the call, since a void return has no
# error sentinel. Used as the type of
# HTMLUnicodeInputStream.reportCharacterErrors.
ctypedef void (*rCEf)(HTMLUnicodeInputStream, unicode) except *

# Module-level global, statically typed as a dict.
# NOTE(review): presumably a cache used by charsUntil -- confirm in the
# matching .py module.
cdef dict charsUntilCache

# Cython declarations for BufferedStream, turning it into an extension type.
# All three attributes are untyped Python objects and are not exposed to
# Python code (no "public"/"readonly" modifier).
cdef class BufferedStream(object):
cdef object stream
cdef object buffer
cdef object position
# cpdef: callable from Python as well as through the fast C-level path.
cpdef object tell(self)
cpdef object seek(self, object pos)
cpdef object read(self, object bytes)
# cdef: C-level only, not visible as attributes from Python code.
cdef object _bufferedBytes(self)
cdef object _readStream(self, object bytes)
cdef object _readFromBuffer(self, object bytes)

#def HTMLInputStream(source, object **kwargs)

# Cython declarations for HTMLUnicodeInputStream, turning it into an
# extension type with statically typed attributes and methods.
cdef class HTMLUnicodeInputStream(object):
# C function pointer (see the rCEf typedef); its signature matches
# characterErrorsUCS4/characterErrorsUCS2 declared below.
# NOTE(review): presumably set to one of those two -- confirm in the .py.
cdef rCEf reportCharacterErrors
cdef object newLines
# "readonly": readable from Python code, but only assignable from Cython.
cdef readonly object charEncoding
cdef object dataStream
cdef unicode chunk
cdef Py_ssize_t chunkSize
cdef Py_ssize_t chunkOffset
# "readonly": readable from Python code, but only assignable from Cython.
cdef readonly list errors

# number of (complete) lines in previous chunks
cdef Py_ssize_t prevNumLines
# number of columns in the last line of the previous chunk
cdef Py_ssize_t prevNumCols

# Deal with CR LF and surrogates split over chunk boundaries
cdef unicode _bufferedCharacter

# C-level only methods (not callable from Python code).
cdef object reset(self)
cdef object openStream(self, object source)

# cython.locals gives C types to local variables of the method that
# immediately follows the decorator.
@cython.locals(nLines=Py_ssize_t, lastLinePos=Py_ssize_t)
cdef tuple _position(self, Py_ssize_t offset)
# cpdef: callable from Python as well as through the fast C-level path.
cpdef tuple position(self)

@cython.locals(chunkOffset=Py_ssize_t, char=unicode)
cpdef unicode char(self)

# "chunkSize=?" marks an optional argument whose default lives in the
# implementation. "except? -1" makes -1 a possible-error return value:
# when it is returned, the caller checks whether an exception is set.
@cython.locals(data=unicode)
cdef bint readChunk(self, Py_ssize_t chunkSize=?) except? -1

# The decorator below applies to characterErrorsUCS4 only. Both methods
# return void, so "except *" makes callers always check for an exception.
@cython.locals(c=ulong)
cdef void characterErrorsUCS4(self, unicode data) except *
cdef void characterErrorsUCS2(self, unicode data) except *

# "opposite=?" is an optional argument; its default lives in the
# implementation.
cpdef object charsUntil(self, object characters, bint opposite=?)
cpdef object unget(self, object char)

# Cython declarations for HTMLBinaryInputStream, an extension type that
# subclasses HTMLUnicodeInputStream.
cdef class HTMLBinaryInputStream(HTMLUnicodeInputStream):
cdef object rawStream
# "readonly": readable from Python code, but only assignable from Cython.
cdef readonly object numBytesMeta
cdef readonly object numBytesChardet
# Untyped Python-object attributes; going by their names they hold the
# candidate encodings from different sources (confirm in the .py).
cdef object override_encoding
cdef object transport_encoding
cdef object same_origin_parent_encoding
cdef object likely_encoding
cdef object default_encoding
# reset/openStream are also declared on the base class with the same
# signatures; they are re-declared here for this subclass.
cdef object reset(self)
cdef object openStream(self, object source)
# "chardet=?" is an optional argument; its default lives in the
# implementation.
cdef object determineEncoding(self, object chardet=?)
# cpdef: callable from Python as well as through the fast C-level path.
cpdef object changeEncoding(self, object newEncoding)
# Types the local variable "string" of detectBOM as bytes.
@cython.locals(string=bytes)
cdef object detectBOM(self)
cdef object detectEncodingMeta(self)

# cdef class EncodingBytes(bytes):
# cdef object previous(self)
# cdef object setPosition(self, object position)
# cdef object getPosition(self)
# cdef object getCurrentByte(self)
# cdef object skip(self, object chars=?)
# cdef object skipUntil(self, object chars)
# cdef object matchBytes(self, object bytes)
# cdef object jumpTo(self, object bytes)

# Function-pointer type for a handler taking an EncodingParser and returning
# a C boolean. "except? -1" makes -1 a possible-error return value: when it
# is returned, the caller checks whether a Python exception is set.
ctypedef bint (*encstate)(EncodingParser) except? -1

# Cython declarations for EncodingParser, turning it into an extension type.
cdef class EncodingParser(object):
cdef object data
cdef object encoding

# The local "func" is typed as an encstate function pointer and
# "keepParsing" as a C boolean.
@cython.locals(func=encstate, keepParsing=bint)
cdef object getEncoding(self)
# The handlers that take only self match the encstate signature;
# handlePossibleTag takes an extra endTag flag and therefore does not.
cdef bint handleComment(self) except? -1
@cython.locals(hasPragma=bint, name=bytes, value=bytes, tentativeEncoding=bytes)
cdef bint handleMeta(self) except? -1
cdef bint handlePossibleStartTag(self) except? -1
cdef bint handlePossibleEndTag(self) except? -1
cdef bint handlePossibleTag(self, bint endTag) except? -1
cdef bint handleOther(self) except? -1
# Types the local variable "c" of getAttribute as bytes; the method
# returns a tuple.
@cython.locals(c=bytes)
cdef tuple getAttribute(self)

# Cython declarations for ContentAttrParser: one untyped attribute and one
# method, which is kept callable from Python code.
cdef class ContentAttrParser(object):
cdef object data
cpdef object parse(self) # this needs to be cpdef for tests

# Module-level C function (not callable from Python code) taking and
# returning untyped Python objects.
cdef object lookupEncoding(object encoding)
Loading