python · pablogsal · May 28, 2023 · May 27, 2023 · May 27, 2023 · May 27, 2023
@@ -85,11 +85,29 @@ def test_basic(self):
  DEDENT '' (5, 0) (5, 0)
  """)
 
- self.check_tokenize("foo='bar'\r\n", """\
- NAME 'foo' (1, 0) (1, 3)
- OP '=' (1, 3) (1, 4)
- STRING "'bar'" (1, 4) (1, 9)
- NEWLINE '\\n' (1, 9) (1, 10)
+ self.check_tokenize("if True:\r\n # NL\r\n foo='bar'\r\n\r\n", """\
+ NAME 'if' (1, 0) (1, 2)
+ NAME 'True' (1, 3) (1, 7)
+ OP ':' (1, 7) (1, 8)
+ NEWLINE '\\r\\n' (1, 8) (1, 10)
+ COMMENT '# NL' (2, 4) (2, 8)
+ NL '\\r\\n' (2, 8) (2, 10)
+ INDENT ' ' (3, 0) (3, 4)
+ NAME 'foo' (3, 4) (3, 7)
+ OP '=' (3, 7) (3, 8)
+ STRING "\'bar\'" (3, 8) (3, 13)
+ NEWLINE '\\r\\n' (3, 13) (3, 15)
+ NL '\\r\\n' (4, 0) (4, 2)
+ DEDENT '' (5, 0) (5, 0)
+ """)
+
+ self.check_tokenize("x = 1 + \\\r\n1\r\n", """\
+ NAME 'x' (1, 0) (1, 1)
+ OP '=' (1, 2) (1, 3)
+ NUMBER '1' (1, 4) (1, 5)
+ OP '+' (1, 6) (1, 7)
+ NUMBER '1' (2, 0) (2, 1)
+ NEWLINE '\\r\\n' (2, 1) (2, 3)
  """)
 
  indent_error_file = b"""\
@@ -1784,9 +1802,9 @@ def test_random_files(self):
  if support.verbose >= 2:
  print('tokenize', testfile)
  with open(testfile, 'rb') as f:
-# with self.subTest(file=testfile):
- self.check_roundtrip(f)
- self.check_line_extraction(f)
+ with self.subTest(file=testfile):
+     self.check_roundtrip(f)
+     self.check_line_extraction(f)
 
 
  def roundtrip(self, code):
@@ -2084,6 +2102,10 @@ def test_string(self):
 b\
 c"""', """\
  STRING 'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4)
+ """)
+
+ self.check_tokenize(r'"hola\\\r\ndfgf"', """\
+ STRING \'"hola\\\\\\\\\\\\r\\\\ndfgf"\' (1, 0) (1, 16)
  """)
 
  self.check_tokenize('f"abc"', """\
@@ -2120,6 +2142,12 @@ def test_string(self):
  FSTRING_START 'Rf"' (1, 0) (1, 3)
  FSTRING_MIDDLE 'abc\\\\\\ndef' (1, 3) (2, 3)
  FSTRING_END '"' (2, 3) (2, 4)
+ """)
+
+ self.check_tokenize(r'f"hola\\\r\ndfgf"', """\
+ FSTRING_START \'f"\' (1, 0) (1, 2)
+ FSTRING_MIDDLE 'hola\\\\\\\\\\\\r\\\\ndfgf' (1, 2) (1, 16)
+ FSTRING_END \'"\' (1, 16) (1, 17)
  """)
 
  def test_function(self):

diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-05-27-21-50-48.gh-issue-105017.4sDyDV.rst b/Misc/NEWS.d/next/Core and Builtins/2023-05-27-21-50-48.gh-issue-105017.4sDyDV.rst
@@ -0,0 +1 @@
+Show CRLF line endings in the ``string`` attribute of both NL and NEWLINE tokens produced by :mod:`tokenize`. Patch by Marta Gómez.
@@ -924,9 +924,9 @@ _PyPegen_run_parser_from_string(const char *str, int start_rule, PyObject *filen
 
  struct tok_state *tok;
  if (flags != NULL && flags->cf_flags & PyCF_IGNORE_COOKIE){
- tok = _PyTokenizer_FromUTF8(str, exec_input);
+ tok = _PyTokenizer_FromUTF8(str, exec_input, 0);
  } else{
- tok = _PyTokenizer_FromString(str, exec_input);
+ tok = _PyTokenizer_FromString(str, exec_input, 0);
  }
  if (tok == NULL){
  if (PyErr_Occurred()){

@@ -772,7 +772,8 @@ translate_into_utf8(const char* str, const char* enc){
 
 
 static char *
-translate_newlines(const char *s, int exec_input, struct tok_state *tok){
+translate_newlines(const char *s, int exec_input, int preserve_crlf,
+ struct tok_state *tok){
  int skip_next_lf = 0;
  size_t needed_length = strlen(s) + 2, final_length;
  char *buf, *current;
@@ -792,7 +793,7 @@ translate_newlines(const char *s, int exec_input, struct tok_state *tok){
  break;
  }
  }
- if (c == '\r'){
+ if (!preserve_crlf && c == '\r'){
  skip_next_lf = 1;
  c = '\n';
  }
@@ -822,14 +823,14 @@ translate_newlines(const char *s, int exec_input, struct tok_state *tok){
  inside TOK. */
 
 static char *
-decode_str(const char *input, int single, struct tok_state *tok)
+decode_str(const char *input, int single, struct tok_state *tok, int preserve_crlf)
 {
  PyObject* utf8 = NULL;
  char *str;
  const char *s;
  const char *newl[2] ={NULL, NULL};
  int lineno = 0;
- tok->input = str = translate_newlines(input, single, tok);
+ tok->input = str = translate_newlines(input, single, preserve_crlf, tok);
  if (str == NULL)
  return NULL;
  tok->enc = NULL;
@@ -881,14 +882,14 @@ decode_str(const char *input, int single, struct tok_state *tok)
 /* Set up tokenizer for string */
 
 struct tok_state *
-_PyTokenizer_FromString(const char *str, int exec_input)
+_PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf)
 {
  struct tok_state *tok = tok_new();
  char *decoded;
 
  if (tok == NULL)
  return NULL;
- decoded = decode_str(str, exec_input, tok);
+ decoded = decode_str(str, exec_input, tok, preserve_crlf);
  if (decoded == NULL){
  _PyTokenizer_Free(tok);
  return NULL;
@@ -902,13 +903,13 @@ _PyTokenizer_FromString(const char *str, int exec_input)
 /* Set up tokenizer for UTF-8 string */
 
 struct tok_state *
-_PyTokenizer_FromUTF8(const char *str, int exec_input)
+_PyTokenizer_FromUTF8(const char *str, int exec_input, int preserve_crlf)
 {
  struct tok_state *tok = tok_new();
  char *translated;
  if (tok == NULL)
  return NULL;
- tok->input = translated = translate_newlines(str, exec_input, tok);
+ tok->input = translated = translate_newlines(str, exec_input, preserve_crlf, tok);
  if (translated == NULL){
  _PyTokenizer_Free(tok);
  return NULL;
@@ -1050,7 +1051,7 @@ tok_underflow_interactive(struct tok_state *tok){
  }
  char *newtok = PyOS_Readline(tok->fp ? tok->fp : stdin, stdout, tok->prompt);
  if (newtok != NULL){
- char *translated = translate_newlines(newtok, 0, tok);
+ char *translated = translate_newlines(newtok, 0, 0, tok);
  PyMem_Free(newtok);
  if (translated == NULL){
  return 0;
@@ -1594,6 +1595,9 @@ tok_decimal_tail(struct tok_state *tok)
 static inline int
 tok_continuation_line(struct tok_state *tok){
  int c = tok_nextc(tok);
+ if (c == '\r'){
+ c = tok_nextc(tok);
+ }
  if (c != '\n'){
  tok->done = E_LINECONT;
  return -1;
@@ -1693,7 +1697,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
  }
  }
  tok_backup(tok, c);
- if (c == '#' || c == '\n'){
+ if (c == '#' || c == '\n' || c == '\r'){
  /* Lines with only whitespace and/or comments
  shouldn't affect the indentation and are
  not passed to the parser as NEWLINE tokens,
@@ -1822,7 +1826,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
  const char *prefix, *type_start;
  int current_starting_col_offset;
 
- while (c != EOF && c != '\n'){
+ while (c != EOF && c != '\n' && c != '\r'){
  c = tok_nextc(tok);
  }
 
@@ -2002,6 +2006,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
  return MAKE_TOKEN(NAME);
  }
 
+ if (c == '\r'){
+ c = tok_nextc(tok);
+ }
+
  /* Newline */
  if (c == '\n'){
  tok->atbol = 1;
@@ -2405,7 +2413,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
  else{
  end_quote_size = 0;
  if (c == '\\'){
- tok_nextc(tok); /* skip escaped char */
+ c = tok_nextc(tok); /* skip escaped char */
+ if (c == '\r'){
+ c = tok_nextc(tok);
+ }
  }
  }
  }
@@ -2696,6 +2707,9 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct
  return MAKE_TOKEN(FSTRING_MIDDLE);
  } else if (c == '\\'){
  int peek = tok_nextc(tok);
+ if (peek == '\r'){
+ peek = tok_nextc(tok);
+ }
  // Special case when the backslash is right before a curly
  // brace. We have to restore and return the control back
  // to the loop for the next iteration.

@@ -135,8 +135,8 @@ struct tok_state{
 #endif
 };
 
-extern struct tok_state *_PyTokenizer_FromString(const char *, int);
-extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int);
+extern struct tok_state *_PyTokenizer_FromString(const char *, int, int);
+extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int, int);
 extern struct tok_state *_PyTokenizer_FromFile(FILE *, const char*,
  const char *, const char *);
 extern void _PyTokenizer_Free(struct tok_state *);

diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c
@@ -55,7 +55,7 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source,
  if (filename == NULL){
  return NULL;
  }
- self->tok = _PyTokenizer_FromUTF8(source, 1);
+ self->tok = _PyTokenizer_FromUTF8(source, 1, 1);
  if (self->tok == NULL){
  Py_DECREF(filename);
  return NULL;
@@ -240,7 +240,12 @@ tokenizeriter_next(tokenizeriterobject *it)
  type = NAME;
  }
  else if (type == NEWLINE){
- str = PyUnicode_FromString("\n");
+ Py_DECREF(str);
+ if (it->tok->start[0] == '\r'){
+ str = PyUnicode_FromString("\r\n");
+ } else{
+ str = PyUnicode_FromString("\n");
+ }
  end_col_offset++;
  }
  }
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Show CRLF line endings in the ``string`` attribute of both NL and NEWLINE tokens produced by :mod:`tokenize`. Patch by Marta Gómez.