pFad - Phone/Frame/Anonymizer/Declutterfier! Saves Data!


--- a PPN by Garber Painting Akron. With Image Size Reduction included!

URL: http://github.com/python/cpython/commit/a24676bedcd332dd7e6fa5521d0449206391d190

Add tests for the C tokenizer and expose it as a private module (GH-2… · python/cpython@a24676b · GitHub
Skip to content

Commit a24676b

Browse files
authored
Add tests for the C tokenizer and expose it as a private module (GH-27924)
1 parent 9ed5231 commit a24676b

9 files changed

Lines changed: 1114 additions & 5 deletions

File tree

Lib/test/test_tokenize.py

Lines changed: 861 additions & 2 deletions
Large diffs are not rendered by default.

Lib/tokenize.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -680,5 +680,13 @@ def error(message, filename=None, location=None):
680680
perror("unexpected error: %s" % err)
681681
raise
682682

683+
def _generate_tokens_from_c_tokenizer(source):
    """Tokenize Python source code (a unicode string) with the internal C tokenizer.

    Yields TokenInfo namedtuples mirroring the pure-Python tokenizer's output.
    """
    import _tokenize as c_tokenizer
    for tok, type, lineno, end_lineno, col_off, end_col_off, line in c_tokenizer.TokenizerIter(source):
        yield TokenInfo(type, tok, (lineno, col_off), (end_lineno, end_col_off), line)
689+
690+
683691
if __name__ == "__main__":
684692
main()

Makefile.pre.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,7 @@ PARSER_HEADERS= \
339339
PYTHON_OBJS= \
340340
Python/_warnings.o \
341341
Python/Python-ast.o \
342+
Python/Python-tokenize.o \
342343
Python/asdl.o \
343344
Python/ast.o \
344345
Python/ast_opt.o \

Modules/config.c.in

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ extern PyObject* PyMarshal_Init(void);
2828
extern PyObject* PyInit__imp(void);
2929
extern PyObject* PyInit_gc(void);
3030
extern PyObject* PyInit__ast(void);
31+
extern PyObject* PyInit__tokenize(void);
3132
extern PyObject* _PyWarnings_Init(void);
3233
extern PyObject* PyInit__string(void);
3334

@@ -44,6 +45,9 @@ struct _inittab _PyImport_Inittab[] = {
4445
/* This lives in Python/Python-ast.c */
4546
{"_ast", PyInit__ast},
4647

48+
/* This lives in Python/Python-tokenize.c */
49+
{"_tokenize", PyInit__tokenize},
50+
4751
/* These entries are here for sys.builtin_module_names */
4852
{"builtins", NULL},
4953
{"sys", NULL},

PC/config.c

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,8 @@ extern PyObject* _PyWarnings_Init(void);
7272
extern PyObject* PyInit__string(void);
7373
extern PyObject* PyInit__stat(void);
7474
extern PyObject* PyInit__opcode(void);
75-
7675
extern PyObject* PyInit__contextvars(void);
77-
76+
extern PyObject* PyInit__tokenize(void);
7877

7978
/* tools/freeze/makeconfig.py marker for additional "extern" */
8079
/* -- ADDMODULE MARKER 1 -- */
@@ -83,7 +82,6 @@ extern PyObject* PyMarshal_Init(void);
8382
extern PyObject* PyInit__imp(void);
8483

8584
struct _inittab _PyImport_Inittab[] = {
86-
8785
{"_abc", PyInit__abc},
8886
{"array", PyInit_array},
8987
{"_ast", PyInit__ast},
@@ -105,6 +103,7 @@ struct _inittab _PyImport_Inittab[] = {
105103
{"_blake2", PyInit__blake2},
106104
{"time", PyInit_time},
107105
{"_thread", PyInit__thread},
106+
{"_tokenize", PyInit__tokenize},
108107
{"_typing", PyInit__typing},
109108
{"_statistics", PyInit__statistics},
110109
#ifdef WIN32

PCbuild/pythoncore.vcxproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -488,6 +488,7 @@
488488
<ClCompile Include="..\Python\pystrtod.c" />
489489
<ClCompile Include="..\Python\dtoa.c" />
490490
<ClCompile Include="..\Python\Python-ast.c" />
491+
<ClCompile Include="..\Python\Python-tokenize.c" />
491492
<ClCompile Include="..\Python\pythonrun.c" />
492493
<ClCompile Include="..\Python\specialize.c" />
493494
<ClCompile Include="..\Python\suggestions.c" />

Python/Python-tokenize.c

Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
#include "Python.h"
2+
#include "../Parser/tokenizer.h"
3+
4+
static struct PyModuleDef _tokenizemodule;
5+
6+
typedef struct {
7+
PyTypeObject* TokenizerIter;
8+
} tokenize_state;
9+
10+
static tokenize_state*
11+
get_tokenize_state(PyObject* module)
12+
{
13+
return (tokenize_state*)PyModule_GetState(module);
14+
}
15+
16+
#define _tokenize_get_state_by_type(type) \
17+
get_tokenize_state(_PyType_GetModuleByDef(type, &_tokenizemodule))
18+
19+
#include "clinic/Python-tokenize.c.h"
20+
21+
/*[clinic input]
22+
module _tokenizer
23+
class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_type(type)->TokenizerIter"
24+
[clinic start generated code]*/
25+
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=96d98ee2fef7a8bc]*/
26+
27+
typedef struct {
28+
PyObject_HEAD
29+
struct tok_state* tok;
30+
} tokenizeriterobject;
31+
32+
/*[clinic input]
33+
@classmethod
34+
_tokenizer.tokenizeriter.__new__ as tokenizeriter_new
35+
36+
source: str
37+
[clinic start generated code]*/
38+
39+
static PyObject *
40+
tokenizeriter_new_impl(PyTypeObject *type, const char *source)
41+
/*[clinic end generated code: output=7fd9f46cf9263cbb input=4384b368407375c6]*/
42+
{
43+
tokenizeriterobject* self = (tokenizeriterobject*)type->tp_alloc(type, 0);
44+
if (self == NULL) {
45+
return NULL;
46+
}
47+
PyObject* filename = PyUnicode_FromString("<string>");
48+
if (filename == NULL) {
49+
return NULL;
50+
}
51+
self->tok = PyTokenizer_FromUTF8(source, 1);
52+
if (self->tok == NULL) {
53+
return NULL;
54+
}
55+
self->tok->filename = filename;
56+
return (PyObject*)self;
57+
}
58+
59+
static PyObject*
60+
tokenizeriter_next(tokenizeriterobject* it)
61+
{
62+
const char* start;
63+
const char* end;
64+
int type = PyTokenizer_Get(it->tok, &start, &end);
65+
if (type == ERRORTOKEN && PyErr_Occurred()) {
66+
return NULL;
67+
}
68+
if (type == ERRORTOKEN || type == ENDMARKER) {
69+
PyErr_SetString(PyExc_StopIteration, "EOF");
70+
return NULL;
71+
}
72+
PyObject* str = NULL;
73+
if (start == NULL || end == NULL) {
74+
str = PyUnicode_FromString("");
75+
} else {
76+
str = PyUnicode_FromStringAndSize(start, end - start);
77+
}
78+
if (str == NULL) {
79+
return NULL;
80+
}
81+
82+
Py_ssize_t size = it->tok->inp - it->tok->buf;
83+
PyObject* line = PyUnicode_DecodeUTF8(it->tok->buf, size, "replace");
84+
if (line == NULL) {
85+
Py_DECREF(str);
86+
return NULL;
87+
}
88+
const char* line_start = type == STRING ? it->tok->multi_line_start : it->tok->line_start;
89+
int lineno = type == STRING ? it->tok->first_lineno : it->tok->lineno;
90+
int end_lineno = it->tok->lineno;
91+
int col_offset = -1;
92+
int end_col_offset = -1;
93+
if (start != NULL && start >= line_start) {
94+
col_offset = (int)(start - line_start);
95+
}
96+
if (end != NULL && end >= it->tok->line_start) {
97+
end_col_offset = (int)(end - it->tok->line_start);
98+
}
99+
100+
return Py_BuildValue("(NiiiiiN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line);
101+
}
102+
103+
static void
104+
tokenizeriter_dealloc(tokenizeriterobject* it)
105+
{
106+
PyTypeObject* tp = Py_TYPE(it);
107+
PyTokenizer_Free(it->tok);
108+
tp->tp_free(it);
109+
Py_DECREF(tp);
110+
}
111+
112+
static PyType_Slot tokenizeriter_slots[] = {
113+
{Py_tp_new, tokenizeriter_new},
114+
{Py_tp_dealloc, tokenizeriter_dealloc},
115+
{Py_tp_getattro, PyObject_GenericGetAttr},
116+
{Py_tp_iter, PyObject_SelfIter},
117+
{Py_tp_iternext, tokenizeriter_next},
118+
{0, NULL},
119+
};
120+
121+
static PyType_Spec tokenizeriter_spec = {
122+
.name = "_tokenize.TokenizerIter",
123+
.basicsize = sizeof(tokenizeriterobject),
124+
.flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE),
125+
.slots = tokenizeriter_slots,
126+
};
127+
128+
129+
static int
130+
tokenizemodule_exec(PyObject* m)
131+
{
132+
tokenize_state* state = get_tokenize_state(m);
133+
if (state == NULL) {
134+
return -1;
135+
}
136+
137+
state->TokenizerIter = (PyTypeObject *)PyType_FromModuleAndSpec(
138+
m, &tokenizeriter_spec, NULL);
139+
if (state->TokenizerIter == NULL) {
140+
return -1;
141+
}
142+
if (PyModule_AddType(m, state->TokenizerIter) < 0) {
143+
return -1;
144+
}
145+
146+
return 0;
147+
}
148+
149+
static PyMethodDef tokenize_methods[] = {
150+
{NULL, NULL, 0, NULL} /* Sentinel */
151+
};
152+
153+
static PyModuleDef_Slot tokenizemodule_slots[] = {
154+
{Py_mod_exec, tokenizemodule_exec},
155+
{0, NULL}
156+
};
157+
158+
static int
159+
tokenizemodule_traverse(PyObject *m, visitproc visit, void *arg)
160+
{
161+
tokenize_state *state = get_tokenize_state(m);
162+
Py_VISIT(state->TokenizerIter);
163+
return 0;
164+
}
165+
166+
static int
167+
tokenizemodule_clear(PyObject *m)
168+
{
169+
tokenize_state *state = get_tokenize_state(m);
170+
Py_CLEAR(state->TokenizerIter);
171+
return 0;
172+
}
173+
174+
static void
175+
tokenizemodule_free(void *m)
176+
{
177+
tokenizemodule_clear((PyObject *)m);
178+
}
179+
180+
static struct PyModuleDef _tokenizemodule = {
181+
PyModuleDef_HEAD_INIT,
182+
.m_name = "_tokenize",
183+
.m_size = sizeof(tokenize_state),
184+
.m_slots = tokenizemodule_slots,
185+
.m_methods = tokenize_methods,
186+
.m_traverse = tokenizemodule_traverse,
187+
.m_clear = tokenizemodule_clear,
188+
.m_free = tokenizemodule_free,
189+
};
190+
191+
PyMODINIT_FUNC
192+
PyInit__tokenize(void)
193+
{
194+
return PyModuleDef_Init(&_tokenizemodule);
195+
}

Python/clinic/Python-tokenize.c.h

Lines changed: 41 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Python/stdlib_module_names.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ static const char* _Py_stdlib_module_names[] = {
8080
"_thread",
8181
"_threading_local",
8282
"_tkinter",
83+
"_tokenize",
8384
"_tracemalloc",
8485
"_typing",
8586
"_uuid",

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad © 2024 Your Company Name. All rights reserved.





Check this box to remove all script contents from the fetched content.



Check this box to remove all images from the fetched content.


Check this box to remove all CSS styles from the fetched content.


Check this box to keep images inefficiently compressed and original size.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy