gh-120317: Lock around global state in the tokenize module #120318
Changes from all commits
@@ -0,0 +1,57 @@
import io
import time
import unittest
import tokenize
from functools import partial
from threading import Thread

from test.support import threading_helper


@threading_helper.requires_working_threading()
class TestTokenize(unittest.TestCase):
Member (review comment): Tested this locally: it crashes spectacularly without the fix :)
    def test_tokenizer_iter(self):
        source = io.StringIO("for _ in a:\n  pass")
        it = tokenize._tokenize.TokenizerIter(source.readline, extra_tokens=False)
        tokens = []

        def next_token(it):
            while True:
                try:
                    r = next(it)
                    tokens.append(tokenize.TokenInfo._make(r))
                    time.sleep(0.03)
                except StopIteration:
                    return

        threads = []
        for _ in range(5):
            threads.append(Thread(target=partial(next_token, it)))

        for thread in threads:
            thread.start()

        for thread in threads:
            thread.join()

        expected_tokens = [
            tokenize.TokenInfo(type=1, string='for', start=(1, 0), end=(1, 3), line='for _ in a:\n'),
            tokenize.TokenInfo(type=1, string='_', start=(1, 4), end=(1, 5), line='for _ in a:\n'),
            tokenize.TokenInfo(type=1, string='in', start=(1, 6), end=(1, 8), line='for _ in a:\n'),
            tokenize.TokenInfo(type=1, string='a', start=(1, 9), end=(1, 10), line='for _ in a:\n'),
            tokenize.TokenInfo(type=11, string=':', start=(1, 10), end=(1, 11), line='for _ in a:\n'),
            tokenize.TokenInfo(type=4, string='', start=(1, 11), end=(1, 11), line='for _ in a:\n'),
            tokenize.TokenInfo(type=5, string='', start=(2, -1), end=(2, -1), line='  pass'),
            tokenize.TokenInfo(type=1, string='pass', start=(2, 2), end=(2, 6), line='  pass'),
            tokenize.TokenInfo(type=4, string='', start=(2, 6), end=(2, 6), line='  pass'),
            tokenize.TokenInfo(type=6, string='', start=(2, -1), end=(2, -1), line='  pass'),
            tokenize.TokenInfo(type=0, string='', start=(2, -1), end=(2, -1), line='  pass'),
        ]

        tokens.sort()
        expected_tokens.sort()
        self.assertListEqual(tokens, expected_tokens)


if __name__ == "__main__":
    unittest.main()
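For reference, the same snippet tokenized on a single thread through the public API yields the token stream the test expects. A minimal sketch (not part of the diff, using only the documented `tokenize.generate_tokens` entry point):

```python
import io
import tokenize

# Single-threaded baseline: tokenize the same source via the public API.
# Once iteration over TokenizerIter is properly locked, the multi-threaded
# test above should collect this same set of tokens (order aside).
source = io.StringIO("for _ in a:\n  pass")
for tok in tokenize.generate_tokens(source.readline):
    print(tok)
```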
@@ -1,9 +1,10 @@
 #include "Python.h"
 #include "errcode.h"
+#include "internal/pycore_critical_section.h"  // Py_BEGIN_CRITICAL_SECTION
 #include "../Parser/lexer/state.h"
 #include "../Parser/lexer/lexer.h"
 #include "../Parser/tokenizer/tokenizer.h"
-#include "../Parser/pegen.h"                   // _PyPegen_byte_offset_to_character_offset()
+#include "../Parser/pegen.h"                   // _PyPegen_byte_offset_to_character_offset()

 static struct PyModuleDef _tokenizemodule;
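The new include pulls in the per-object locking macros used throughout the rest of the diff. As a rough sketch of the pattern they enable (not part of the change; `someobject`, `some_method`, and `do_work_locked` are made-up names), a method that touches mutable object state brackets that work in a critical section, and helpers called from inside it can assert that the lock is held:

```c
#include "Python.h"
#include "internal/pycore_critical_section.h"

/* Hypothetical extension type and helper, for illustration only. */
typedef struct { PyObject_HEAD int counter; } someobject;

static int
do_work_locked(someobject *self)
{
    /* Helpers invoked from inside the section can assert that the
       caller already holds the per-object lock. */
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(self);
    return ++self->counter;
}

static PyObject *
some_method(someobject *self, PyObject *Py_UNUSED(ignored))
{
    PyObject *result = NULL;
    /* On free-threaded builds this locks `self`; on GIL builds the
       macros compile away, so the code stays correct either way. */
    Py_BEGIN_CRITICAL_SECTION(self);
    result = PyLong_FromLong(do_work_locked(self));
    Py_END_CRITICAL_SECTION();
    return result;
}
```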
@@ -84,14 +85,16 @@ tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline,
 }

 static int
-_tokenizer_error(struct tok_state *tok)
+_tokenizer_error(tokenizeriterobject *it)
 {
+    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it);
     if (PyErr_Occurred()) {
         return -1;
     }

     const char *msg = NULL;
     PyObject* errtype = PyExc_SyntaxError;
+    struct tok_state *tok = it->tok;
     switch (tok->done) {
         case E_TOKEN:
             msg = "invalid token";
@@ -177,17 +180,78 @@ _tokenizer_error(struct tok_state *tok)
     return result;
 }

+static PyObject *
+_get_current_line(tokenizeriterobject *it, const char *line_start, Py_ssize_t size,
+                  int *line_changed)
+{
+    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it);
+    PyObject *line;
+    if (it->tok->lineno != it->last_lineno) {
+        // Line has changed since last token, so we fetch the new line and cache it
+        // in the iter object.
+        Py_XDECREF(it->last_line);
+        line = PyUnicode_DecodeUTF8(line_start, size, "replace");
+        it->last_line = line;
+        it->byte_col_offset_diff = 0;
+    }
+    else {
+        line = it->last_line;
+        *line_changed = 0;
+    }
+    return line;
+}
+
+static void
+_get_col_offsets(tokenizeriterobject *it, struct token token, const char *line_start,
+                 PyObject *line, int line_changed, Py_ssize_t lineno, Py_ssize_t end_lineno,
+                 Py_ssize_t *col_offset, Py_ssize_t *end_col_offset)
+{
+    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it);
+    Py_ssize_t byte_offset = -1;
+    if (token.start != NULL && token.start >= line_start) {
+        byte_offset = token.start - line_start;
+        if (line_changed) {
+            *col_offset = _PyPegen_byte_offset_to_character_offset_line(line, 0, byte_offset);
+            it->byte_col_offset_diff = byte_offset - *col_offset;
+        }
+        else {
+            *col_offset = byte_offset - it->byte_col_offset_diff;
+        }
+    }
+
+    if (token.end != NULL && token.end >= it->tok->line_start) {
+        Py_ssize_t end_byte_offset = token.end - it->tok->line_start;
+        if (lineno == end_lineno) {
+            // If the whole token is at the same line, we can just use the token.start
+            // buffer for figuring out the new column offset, since using line is not
+            // performant for very long lines.
+            Py_ssize_t token_col_offset = _PyPegen_byte_offset_to_character_offset_line(line, byte_offset, end_byte_offset);
+            *end_col_offset = *col_offset + token_col_offset;
+            it->byte_col_offset_diff += token.end - token.start - token_col_offset;
+        }
+        else {
+            *end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, end_byte_offset);
+            it->byte_col_offset_diff += end_byte_offset - *end_col_offset;
+        }
+    }
+
+    it->last_lineno = lineno;
+    it->last_end_lineno = end_lineno;
+}
+
 static PyObject *
 tokenizeriter_next(tokenizeriterobject *it)
 {
+    PyObject* result = NULL;
+    Py_BEGIN_CRITICAL_SECTION(it);
+
     struct token token;
     _PyToken_Init(&token);

     int type = _PyTokenizer_Get(it->tok, &token);
     if (type == ERRORTOKEN) {
         if (!PyErr_Occurred()) {
-            _tokenizer_error(it->tok);
+            _tokenizer_error(it);
             assert(PyErr_Occurred());
         }
         goto exit;
@@ -224,18 +288,7 @@ tokenizeriter_next(tokenizeriterobject *it)
             size -= 1;
         }

-        if (it->tok->lineno != it->last_lineno) {
-            // Line has changed since last token, so we fetch the new line and cache it
-            // in the iter object.
-            Py_XDECREF(it->last_line);
-            line = PyUnicode_DecodeUTF8(line_start, size, "replace");
-            it->last_line = line;
-            it->byte_col_offset_diff = 0;
-        } else {
-            // Line hasn't changed so we reuse the cached one.
-            line = it->last_line;
-            line_changed = 0;
-        }
+        line = _get_current_line(it, line_start, size, &line_changed);
     }
     if (line == NULL) {
         Py_DECREF(str);
@@ -244,36 +297,10 @@ tokenizeriter_next(tokenizeriterobject *it)
     Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno;
     Py_ssize_t end_lineno = it->tok->lineno;
-    it->last_lineno = lineno;
-    it->last_end_lineno = end_lineno;
-
     Py_ssize_t col_offset = -1;
     Py_ssize_t end_col_offset = -1;
-    Py_ssize_t byte_offset = -1;
-    if (token.start != NULL && token.start >= line_start) {
-        byte_offset = token.start - line_start;
-        if (line_changed) {
-            col_offset = _PyPegen_byte_offset_to_character_offset_line(line, 0, byte_offset);
-            it->byte_col_offset_diff = byte_offset - col_offset;
-        }
-        else {
-            col_offset = byte_offset - it->byte_col_offset_diff;
-        }
-    }
-
-    if (token.end != NULL && token.end >= it->tok->line_start) {
-        Py_ssize_t end_byte_offset = token.end - it->tok->line_start;
-        if (lineno == end_lineno) {
-            // If the whole token is at the same line, we can just use the token.start
-            // buffer for figuring out the new column offset, since using line is not
-            // performant for very long lines.
-            Py_ssize_t token_col_offset = _PyPegen_byte_offset_to_character_offset_line(line, byte_offset, end_byte_offset);
-            end_col_offset = col_offset + token_col_offset;
-            it->byte_col_offset_diff += token.end - token.start - token_col_offset;
-        } else {
-            end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, end_byte_offset);
-            it->byte_col_offset_diff += end_byte_offset - end_col_offset;
-        }
-    }
+    _get_col_offsets(it, token, line_start, line, line_changed,
+                     lineno, end_lineno, &col_offset, &end_col_offset);

     if (it->tok->tok_extra_tokens) {
         if (is_trailing_token) {
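The `byte_col_offset_diff` bookkeeping that moved into `_get_col_offsets` exists because token boundaries come back as byte offsets into the UTF-8 line buffer, while `TokenInfo` columns are character offsets. A small illustration of the difference being cached (plain Python, not part of the diff; the sample line is made up):

```python
# On non-ASCII lines, byte offsets into the UTF-8 buffer run ahead of
# character offsets; the iterator caches the running difference so the
# expensive conversion only happens when the line changes.
line = "α = 1"                      # 'α' is 2 bytes in UTF-8 but 1 character
raw = line.encode("utf-8")

byte_offset = raw.index(b"=")       # 3: offset into the byte buffer
char_offset = line.index("=")       # 2: offset into the decoded line
print(byte_offset - char_offset)    # 1 == the difference that gets cached
```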
@@ -315,6 +342,8 @@ tokenizeriter_next(tokenizeriterobject *it)
     if (type == ENDMARKER) {
         it->done = 1;
     }
+
+    Py_END_CRITICAL_SECTION();
     return result;
 }