From e13814d9678f7821d49619d89532a680b271e439 Mon Sep 17 00:00:00 2001
From: Daniel Bowring <git@daniel.bowring.email>
Date: Thu, 15 Aug 2024 21:37:40 +1000
Subject: [PATCH 01/27] Drop support for EOL OSX <10.9

---
 setup.py | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/setup.py b/setup.py
index 2770abe..9c7dd48 100755
--- a/setup.py
+++ b/setup.py
@@ -7,16 +7,6 @@
 from setuptools import setup
 import sys
 
-lxml_requirement = "lxml"
-if sys.platform == "darwin":
-    import platform
-
-    mac_ver = platform.mac_ver()[0]
-    mac_major, mac_minor = mac_ver.split('.')[:2]
-    if int(mac_major) == 10 and int(mac_minor) < 9:
-        print("Using lxml<2.4")
-        lxml_requirement = "lxml<2.4"
-
 speed_deps = [
      "cchardet",
 ]
@@ -60,7 +50,7 @@ def find_version(*file_paths):
     license="Apache License 2.0",
     url="http://github.com/buriy/python-readability",
     packages=["readability", "readability.compat"],
-    install_requires=["chardet", lxml_requirement, "cssselect"],
+    install_requires=["chardet", "lxml", "cssselect"],
     tests_require=test_deps,
     extras_require=extras,
     classifiers=[

From 4ebadbdace5153ab1e3a2a60dd010f6e2626cd22 Mon Sep 17 00:00:00 2001
From: Daniel Bowring <git@daniel.bowring.email>
Date: Thu, 15 Aug 2024 21:39:44 +1000
Subject: [PATCH 02/27] Add missing classifiers for supported python versions

---
 setup.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 9c7dd48..8f6e39a 100755
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,6 @@
 import os
 import re
 from setuptools import setup
-import sys
 
 speed_deps = [
      "cchardet",
@@ -69,5 +68,8 @@ def find_version(*file_paths):
         "Programming Language :: Python :: 3.6",
         "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: Implementation :: PyPy",
     ],
 )

From 7fae6223e12709d5bf22cf929e73a22b5565ef44 Mon Sep 17 00:00:00 2001
From: Daniel Bowring <git@daniel.bowring.email>
Date: Thu, 15 Aug 2024 21:43:26 +1000
Subject: [PATCH 03/27] Fix support for lxml>=5.2.0

---
 setup.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 8f6e39a..cca17d4 100755
--- a/setup.py
+++ b/setup.py
@@ -49,7 +49,12 @@ def find_version(*file_paths):
     license="Apache License 2.0",
     url="http://github.com/buriy/python-readability",
     packages=["readability", "readability.compat"],
-    install_requires=["chardet", "lxml", "cssselect"],
+    install_requires=[
+        "chardet",
+        "lxml[html_clean]",
+        "lxml-html-clean; python_version < '3.11'",
+        "cssselect"
+    ],
     tests_require=test_deps,
     extras_require=extras,
     classifiers=[

From 2c90062c4c96baba7d8020ae921c63f2a41fa3d3 Mon Sep 17 00:00:00 2001
From: Daniel Bowring <git@daniel.bowring.email>
Date: Thu, 15 Aug 2024 21:51:50 +1000
Subject: [PATCH 04/27] Drop support for EOL Python 2.7

---
 .travis.yml                    |  7 ----
 doc/source/conf.py             |  1 -
 readability/compat/__init__.py | 26 --------------
 readability/compat/three.py    |  6 ----
 readability/compat/two.py      |  6 ----
 readability/encoding.py        |  9 +++--
 readability/htmls.py           | 19 +++++-----
 readability/readability.py     | 63 ++++++++++++----------------------
 setup.py                       |  5 +--
 tox.ini                        |  2 +-
 10 files changed, 36 insertions(+), 108 deletions(-)
 delete mode 100644 readability/compat/__init__.py
 delete mode 100644 readability/compat/three.py
 delete mode 100644 readability/compat/two.py

diff --git a/.travis.yml b/.travis.yml
index 21e1ce1..cab9e23 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,9 +4,6 @@ cache: pip
 
 matrix:
   include:
-    - name: "Python 2.7 on Linux"
-      python: 2.7
-      env: PIP=pip
     - name: "Python 3.5 on Linux"
       python: 3.5
     - name: "Python 3.6 on Linux"
@@ -19,9 +16,6 @@ matrix:
     - name: "Python 3.9 Nightly on Linux"
       dist: bionic
       python: nightly
-    - name: "Pypy on Linux"
-      python: pypy
-      env: PIP=pip
     - name: "Pypy 3 on Linux"
       python: pypy3
     - name: "Python 3.7 on older macOS"
@@ -44,7 +38,6 @@ matrix:
         - pip3 --version
   allow_failures:
     - python: nightly
-    - python: pypy
     - python: pypy3
     - os: osx
 
diff --git a/doc/source/conf.py b/doc/source/conf.py
index bb26134..a099772 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
 #
 # readability documentation build configuration file, created by
 # sphinx-quickstart on Thu Mar 23 16:29:38 2017.
diff --git a/readability/compat/__init__.py b/readability/compat/__init__.py
deleted file mode 100644
index caf0ea8..0000000
--- a/readability/compat/__init__.py
+++ /dev/null
@@ -1,26 +0,0 @@
-"""
-This module contains compatibility helpers for Python 2/3 interoperability.
-
-It mainly exists because their are certain incompatibilities in the Python
-syntax that can only be solved by conditionally importing different functions.
-"""
-import sys
-from lxml.etree import tostring
-
-if sys.version_info[0] == 2:
-    bytes_ = str
-    str_ = unicode
-    def tostring_(s):
-        return tostring(s, encoding='utf-8').decode('utf-8')
-
-elif sys.version_info[0] == 3:
-    bytes_ = bytes
-    str_ = str
-    def tostring_(s):
-        return tostring(s, encoding='utf-8')
-
-
-try:
-    from re import Pattern as pattern_type
-except ImportError:
-    from re import _pattern_type as pattern_type
diff --git a/readability/compat/three.py b/readability/compat/three.py
deleted file mode 100644
index 2635157..0000000
--- a/readability/compat/three.py
+++ /dev/null
@@ -1,6 +0,0 @@
-def raise_with_traceback(exc_type, traceback, *args, **kwargs):
-    """
-    Raise a new exception of type `exc_type` with an existing `traceback`. All
-    additional (keyword-)arguments are forwarded to `exc_type`
-    """
-    raise exc_type(*args, **kwargs).with_traceback(traceback)
diff --git a/readability/compat/two.py b/readability/compat/two.py
deleted file mode 100644
index 642ecb7..0000000
--- a/readability/compat/two.py
+++ /dev/null
@@ -1,6 +0,0 @@
-def raise_with_traceback(exc_type, traceback, *args, **kwargs):
-    """
-    Raise a new exception of type `exc_type` with an existing `traceback`. All
-    additional (keyword-)arguments are forwarded to `exc_type`
-    """
-    raise exc_type(*args, **kwargs), None, traceback
diff --git a/readability/encoding.py b/readability/encoding.py
index 212ff92..c95cc14 100644
--- a/readability/encoding.py
+++ b/readability/encoding.py
@@ -39,11 +39,10 @@ def get_encoding(page):
     for declared_encoding in declared_encodings:
         try:
             # Python3 only
-            if sys.version_info[0] == 3:
-                # declared_encoding will actually be bytes but .decode() only
-                # accepts `str` type. Decode blindly with ascii because no one should
-                # ever use non-ascii characters in the name of an encoding.
-                declared_encoding = declared_encoding.decode("ascii", "replace")
+            # declared_encoding will actually be bytes but .decode() only
+            # accepts `str` type. Decode blindly with ascii because no one should
+            # ever use non-ascii characters in the name of an encoding.
+            declared_encoding = declared_encoding.decode("ascii", "replace")
 
             encoding = fix_charset(declared_encoding)
             # Now let's decode the page
diff --git a/readability/htmls.py b/readability/htmls.py
index acacb5a..87299f5 100644
--- a/readability/htmls.py
+++ b/readability/htmls.py
@@ -4,13 +4,12 @@
 
 from .cleaners import normalize_spaces, clean_attributes
 from .encoding import get_encoding
-from .compat import str_
 
 utf8_parser = lxml.html.HTMLParser(encoding="utf-8")
 
 
 def build_doc(page):
-    if isinstance(page, str_):
+    if isinstance(page, str):
         encoding = None
         decoded_page = page
     else:
@@ -30,14 +29,14 @@ def js_re(src, pattern, flags, repl):
 
 def normalize_entities(cur_title):
     entities = {
-        u"\u2014": "-",
-        u"\u2013": "-",
-        u"&mdash;": "-",
-        u"&ndash;": "-",
-        u"\u00A0": " ",
-        u"\u00AB": '"',
-        u"\u00BB": '"',
-        u"&quot;": '"',
+        "\u2014": "-",
+        "\u2013": "-",
+        "&mdash;": "-",
+        "&ndash;": "-",
+        "\u00A0": " ",
+        "\u00AB": '"',
+        "\u00BB": '"',
+        "&quot;": '"',
     }
     for c, r in entities.items():
         if c in cur_title:
diff --git a/readability/readability.py b/readability/readability.py
index f16b170..5fc8b32 100755
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -1,9 +1,12 @@
 #!/usr/bin/env python
-from __future__ import print_function
 import logging
 import re
 import sys
+import urllib.request
+import urllib.parse
+import urllib.error
 
+from lxml.etree import tostring
 from lxml.etree import tounicode
 from lxml.etree import _ElementTree
 from lxml.html import document_fromstring
@@ -17,7 +20,6 @@
 from .htmls import get_title
 from .htmls import get_author
 from .htmls import shorten_title
-from .compat import str_, bytes_, tostring_, pattern_type
 from .debug import describe, text_content
 
 
@@ -80,14 +82,14 @@ def text_length(i):
 def compile_pattern(elements):
     if not elements:
         return None
-    elif isinstance(elements, pattern_type):
+    elif isinstance(elements, re.Pattern):
         return elements
-    elif isinstance(elements, (str_, bytes_)):
-        if isinstance(elements, bytes_):
-            elements = str_(elements, "utf-8")
-        elements = elements.split(u",")
+    elif isinstance(elements, (str, bytes)):
+        if isinstance(elements, bytes):
+            elements = str(elements, "utf-8")
+        elements = elements.split(",")
     if isinstance(elements, (list, tuple)):
-        return re.compile(u"|".join([re.escape(x.strip()) for x in elements]), re.U)
+        return re.compile("|".join([re.escape(x.strip()) for x in elements]), re.U)
     else:
         raise Exception("Unknown type for the pattern: {}".format(type(elements)))
         # assume string or string like object
@@ -242,19 +244,15 @@ def summary(self, html_partial=False):
                         log.info("ruthless removal did not work. ")
                         ruthless = False
                         log.debug(
-                            (
                                 "ended up stripping too much - "
                                 "going for a safer _parse"
-                            )
                         )
                         # try again
                         continue
                     else:
                         log.debug(
-                            (
                                 "Ruthless and lenient parsing did not work. "
                                 "Returning raw html"
-                            )
                         )
                         article = self.html.find("body")
                         if article is None:
@@ -272,11 +270,7 @@ def summary(self, html_partial=False):
                     return cleaned_article
         except Exception as e:
             log.exception("error getting summary: ")
-            if sys.version_info[0] == 2:
-                from .compat.two import raise_with_traceback
-            else:
-                from .compat.three import raise_with_traceback
-            raise_with_traceback(Unparseable, sys.exc_info()[2], str_(e))
+            raise Unparseable(str(e)).with_traceback(sys.exc_info()[2])
 
     def get_article(self, candidates, best_candidate, html_partial=False):
         # Now that we have the top candidate, look through its siblings for
@@ -474,7 +468,8 @@ def transform_misused_divs_into_paragraphs(self):
             # This results in incorrect results in case there is an <img>
             # buried within an <a> for example
             if not REGEXES["divToPElementsRe"].search(
-                str_(b"".join(map(tostring_, list(elem))))
+                str(b"".join(tostring(s, encoding='utf-8') for s in elem))
+                # str(b"".join(map(tostring_, list(elem))))
             ):
                 # log.debug("Altering %s to p" % (describe(elem)))
                 elem.tag = "p"
@@ -501,13 +496,11 @@ def transform_misused_divs_into_paragraphs(self):
 
     def tags(self, node, *tag_names):
         for tag_name in tag_names:
-            for e in node.findall(".//%s" % tag_name):
-                yield e
+            yield from node.findall(".//%s" % tag_name)
 
     def reverse_tags(self, node, *tag_names):
         for tag_name in tag_names:
-            for e in reversed(node.findall(".//%s" % tag_name)):
-                yield e
+            yield from reversed(node.findall(".//%s" % tag_name))
 
     def sanitize(self, node, candidates):
         MIN_LEN = self.min_text_length
@@ -594,13 +587,13 @@ def sanitize(self, node, candidates):
                     )
                     to_remove = True
                 elif weight < 25 and link_density > 0.2:
-                    reason = "too many links %.3f for its weight %s" % (
+                    reason = "too many links {:.3f} for its weight {}".format(
                         link_density,
                         weight,
                     )
                     to_remove = True
                 elif weight >= 25 and link_density > 0.5:
-                    reason = "too many links %.3f for its weight %s" % (
+                    reason = "too many links {:.3f} for its weight {}".format(
                         link_density,
                         weight,
                     )
@@ -726,18 +719,10 @@ def main():
     file = None
     if options.url:
         headers = {"User-Agent": "Mozilla/5.0"}
-        if sys.version_info[0] == 3:
-            import urllib.request, urllib.parse, urllib.error
-
-            request = urllib.request.Request(options.url, None, headers)
-            file = urllib.request.urlopen(request)
-        else:
-            import urllib2
-
-            request = urllib2.Request(options.url, None, headers)
-            file = urllib2.urlopen(request)
+        request = urllib.request.Request(options.url, None, headers)
+        file = urllib.request.urlopen(request)
     else:
-        file = open(args[0], "rt")
+        file = open(args[0])
     try:
         doc = Document(
             file.read(),
@@ -751,14 +736,8 @@ def main():
             result = "<h2>" + doc.short_title() + "</h2><br/>" + doc.summary()
             open_in_browser(result)
         else:
-            enc = (
-                sys.__stdout__.encoding or "utf-8"
-            )  # XXX: this hack could not always work, better to set PYTHONIOENCODING
             result = "Title:" + doc.short_title() + "\n" + doc.summary()
-            if sys.version_info[0] == 3:
-                print(result)
-            else:
-                print(result.encode(enc, "replace"))
+            print(result)
     finally:
         file.close()
 
diff --git a/setup.py b/setup.py
index cca17d4..032c057 100755
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,5 @@
 #!/usr/bin/env python
 
-from __future__ import print_function
 import codecs
 import os
 import re
@@ -48,7 +47,7 @@ def find_version(*file_paths):
     long_description_content_type='text/x-rst',
     license="Apache License 2.0",
     url="http://github.com/buriy/python-readability",
-    packages=["readability", "readability.compat"],
+    packages=["readability"],
     install_requires=[
         "chardet",
         "lxml[html_clean]",
@@ -66,8 +65,6 @@ def find_version(*file_paths):
         "Topic :: Internet",
         "Topic :: Software Development :: Libraries :: Python Modules",
         "Programming Language :: Python",
-        "Programming Language :: Python :: 2",
-        "Programming Language :: Python :: 2.7",
         "Programming Language :: Python :: 3",
         "Programming Language :: Python :: 3.5",
         "Programming Language :: Python :: 3.6",
diff --git a/tox.ini b/tox.ini
index d695433..7742484 100644
--- a/tox.ini
+++ b/tox.ini
@@ -5,7 +5,7 @@
 
 [tox]
 envlist =
-    py{27,35,36,37,38,39,310,py,py3}, doc
+    py{35,36,37,38,39,310,py3}, doc
 skip_missing_interpreters =
     True
 

From 4cf9eedfb3693da4d26c4c02ec316b83951cf020 Mon Sep 17 00:00:00 2001
From: Daniel Bowring <git@daniel.bowring.email>
Date: Thu, 15 Aug 2024 21:57:26 +1000
Subject: [PATCH 05/27] Drop support for EOL Python 3.5

---
 .travis.yml                | 2 --
 readability/readability.py | 6 +++---
 setup.py                   | 1 -
 tox.ini                    | 2 +-
 4 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index cab9e23..9f032c7 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,8 +4,6 @@ cache: pip
 
 matrix:
   include:
-    - name: "Python 3.5 on Linux"
-      python: 3.5
     - name: "Python 3.6 on Linux"
       python: 3.6
     - name: "Python 3.7 on Linux"
diff --git a/readability/readability.py b/readability/readability.py
index 5fc8b32..c86e7d1 100755
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -91,7 +91,7 @@ def compile_pattern(elements):
     if isinstance(elements, (list, tuple)):
         return re.compile("|".join([re.escape(x.strip()) for x in elements]), re.U)
     else:
-        raise Exception("Unknown type for the pattern: {}".format(type(elements)))
+        raise Exception(f"Unknown type for the pattern: {type(elements)}")
         # assume string or string like object
 
 
@@ -332,7 +332,7 @@ def select_best_candidate(self, candidates):
         )
         for candidate in sorted_candidates[:5]:
             elem = candidate["elem"]
-            log.debug("Top 5 : %6.3f %s" % (candidate["content_score"], describe(elem)))
+            log.debug("Top 5 : {:6.3f} {}".format(candidate["content_score"], describe(elem)))
 
         best_candidate = sorted_candidates[0]
         return best_candidate
@@ -448,7 +448,7 @@ def score_node(self, elem):
 
     def remove_unlikely_candidates(self):
         for elem in self.html.findall(".//*"):
-            s = "%s %s" % (elem.get("class", ""), elem.get("id", ""))
+            s = "{} {}".format(elem.get("class", ""), elem.get("id", ""))
             if len(s) < 2:
                 continue
             if (
diff --git a/setup.py b/setup.py
index 032c057..b5d47da 100755
--- a/setup.py
+++ b/setup.py
@@ -66,7 +66,6 @@ def find_version(*file_paths):
         "Topic :: Software Development :: Libraries :: Python Modules",
         "Programming Language :: Python",
         "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.5",
         "Programming Language :: Python :: 3.6",
         "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
diff --git a/tox.ini b/tox.ini
index 7742484..532a20f 100644
--- a/tox.ini
+++ b/tox.ini
@@ -5,7 +5,7 @@
 
 [tox]
 envlist =
-    py{35,36,37,38,39,310,py3}, doc
+    py{36,37,38,39,310,py3}, doc
 skip_missing_interpreters =
     True
 

From 0b01ac6972e19c126e6235c06ed6afdb16712a50 Mon Sep 17 00:00:00 2001
From: Daniel Bowring <git@daniel.bowring.email>
Date: Thu, 15 Aug 2024 21:58:12 +1000
Subject: [PATCH 06/27] Drop support for EOL Python 3.6

---
 .travis.yml | 2 --
 setup.py    | 1 -
 tox.ini     | 2 +-
 3 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 9f032c7..6fb4640 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,8 +4,6 @@ cache: pip
 
 matrix:
   include:
-    - name: "Python 3.6 on Linux"
-      python: 3.6
     - name: "Python 3.7 on Linux"
       python: 3.7
     - name: "Python 3.8 on Linux"
diff --git a/setup.py b/setup.py
index b5d47da..dfb5db5 100755
--- a/setup.py
+++ b/setup.py
@@ -66,7 +66,6 @@ def find_version(*file_paths):
         "Topic :: Software Development :: Libraries :: Python Modules",
         "Programming Language :: Python",
         "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.6",
         "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
diff --git a/tox.ini b/tox.ini
index 532a20f..53d7873 100644
--- a/tox.ini
+++ b/tox.ini
@@ -5,7 +5,7 @@
 
 [tox]
 envlist =
-    py{36,37,38,39,310,py3}, doc
+    py{38,39,310,py,py3}, doc
 skip_missing_interpreters =
     True
 

From 26f11c05d8e11d826e2d9438c714d4f9bb6a1d43 Mon Sep 17 00:00:00 2001
From: Daniel Bowring <git@daniel.bowring.email>
Date: Thu, 15 Aug 2024 21:59:19 +1000
Subject: [PATCH 07/27] Drop support for EOL Python 3.7

---
 .travis.yml | 20 --------------------
 setup.py    |  1 -
 tox.ini     |  2 +-
 3 files changed, 1 insertion(+), 22 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 6fb4640..ea56f51 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,8 +4,6 @@ cache: pip
 
 matrix:
   include:
-    - name: "Python 3.7 on Linux"
-      python: 3.7
     - name: "Python 3.8 on Linux"
       dist: xenial
       python: 3.8
@@ -14,24 +12,6 @@ matrix:
       python: nightly
     - name: "Pypy 3 on Linux"
       python: pypy3
-    - name: "Python 3.7 on older macOS"
-      os: osx
-      osx_image: xcode9.4
-      language: shell
-      env: TOXENV=py37
-      before_install:
-        - sw_vers
-        - python3 --version
-        - pip3 --version
-    - name: "Python 3.7 on macOS"
-      os: osx
-      osx_image: xcode11
-      language: shell
-      env: TOXENV=py37
-      before_install:
-        - sw_vers
-        - python3 --version
-        - pip3 --version
   allow_failures:
     - python: nightly
     - python: pypy3
diff --git a/setup.py b/setup.py
index dfb5db5..1a445fb 100755
--- a/setup.py
+++ b/setup.py
@@ -66,7 +66,6 @@ def find_version(*file_paths):
         "Topic :: Software Development :: Libraries :: Python Modules",
         "Programming Language :: Python",
         "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
         "Programming Language :: Python :: 3.10",
diff --git a/tox.ini b/tox.ini
index 53d7873..b78bd0b 100644
--- a/tox.ini
+++ b/tox.ini
@@ -5,7 +5,7 @@
 
 [tox]
 envlist =
-    py{38,39,310,py,py3}, doc
+    py{38,39,310,py3}, doc
 skip_missing_interpreters =
     True
 

From b34c8d98fde88fe04d39ad241b933ca6a73fc86f Mon Sep 17 00:00:00 2001
From: Daniel Bowring <git@daniel.bowring.email>
Date: Thu, 15 Aug 2024 22:00:05 +1000
Subject: [PATCH 08/27] Add Python 3.11 to tox matrix

---
 setup.py | 1 +
 tox.ini  | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 1a445fb..a71fba7 100755
--- a/setup.py
+++ b/setup.py
@@ -69,6 +69,7 @@ def find_version(*file_paths):
         "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
         "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
         "Programming Language :: Python :: Implementation :: PyPy",
     ],
 )
diff --git a/tox.ini b/tox.ini
index b78bd0b..1bcbb6e 100644
--- a/tox.ini
+++ b/tox.ini
@@ -5,7 +5,7 @@
 
 [tox]
 envlist =
-    py{38,39,310,py3}, doc
+    py{38,39,310,311,py3}, doc
 skip_missing_interpreters =
     True
 

From 2987875dea3c89e966ee09c393a0015d7b8bb8da Mon Sep 17 00:00:00 2001
From: Daniel Bowring <git@daniel.bowring.email>
Date: Thu, 15 Aug 2024 22:01:20 +1000
Subject: [PATCH 09/27] Add Python 3.12 to tox matrix

---
 setup.py | 1 +
 tox.ini  | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index a71fba7..294572d 100755
--- a/setup.py
+++ b/setup.py
@@ -70,6 +70,7 @@ def find_version(*file_paths):
         "Programming Language :: Python :: 3.9",
         "Programming Language :: Python :: 3.10",
         "Programming Language :: Python :: 3.11",
+        "Programming Language :: Python :: 3.12",
         "Programming Language :: Python :: Implementation :: PyPy",
     ],
 )
diff --git a/tox.ini b/tox.ini
index 1bcbb6e..ff8f68c 100644
--- a/tox.ini
+++ b/tox.ini
@@ -5,7 +5,7 @@
 
 [tox]
 envlist =
-    py{38,39,310,311,py3}, doc
+    py{38,39,310,311,312,py3}, doc
 skip_missing_interpreters =
     True
 

From 24d97d1591aaf28695fbb2700523669abfcea4f8 Mon Sep 17 00:00:00 2001
From: Daniel Bowring <git@daniel.bowring.email>
Date: Thu, 15 Aug 2024 22:06:51 +1000
Subject: [PATCH 10/27] Update documentation build to use sphinx-build

---
 tox.ini | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tox.ini b/tox.ini
index ff8f68c..d4adf61 100644
--- a/tox.ini
+++ b/tox.ini
@@ -30,4 +30,4 @@ commands =
 
 [testenv:doc]
 commands =
-    python setup.py build_sphinx
+    sphinx-build -b html doc/source/ build/

From 2e48d37c1a93048de51c8cd19416b4794697bf32 Mon Sep 17 00:00:00 2001
From: Daniel Bowring <git@daniel.bowring.email>
Date: Thu, 15 Aug 2024 22:07:59 +1000
Subject: [PATCH 11/27] Fix warning during doc build

---
 doc/source/conf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/conf.py b/doc/source/conf.py
index a099772..afb13f7 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -71,7 +71,7 @@
 #
 # This is also used if you do content translation via gettext catalogs.
 # Usually you set "language" from the command line for these cases.
-language = None
+language = "en"
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.

From acb2f3d019b31f99bba176b2faa9ed73ef224e2b Mon Sep 17 00:00:00 2001
From: Daniel Bowring <git@daniel.bowring.email>
Date: Thu, 15 Aug 2024 22:09:59 +1000
Subject: [PATCH 12/27] Replace deprecated recommonmark with myst-parser

---
 doc/source/conf.py | 2 +-
 tox.ini            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/source/conf.py b/doc/source/conf.py
index afb13f7..e70cf9b 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -37,7 +37,7 @@
     "sphinx.ext.doctest",
     "sphinx.ext.intersphinx",
     "sphinx.ext.todo",
-    "recommonmark",
+    "myst_parser",
 ]
 
 # Add any paths that contain templates here, relative to this directory.
diff --git a/tox.ini b/tox.ini
index d4adf61..3f03df8 100644
--- a/tox.ini
+++ b/tox.ini
@@ -14,7 +14,7 @@ deps =
     pytest
     doc: sphinx
     doc: sphinx_rtd_theme
-    doc: recommonmark
+    doc: myst-parser
 
 # This creates the virtual envs with --site-packages so already packages
 # that are already installed will be reused. This is especially useful on

From 7fcf70bea765a60a1cf05d1cdea60d3c3f56ab46 Mon Sep 17 00:00:00 2001
From: Daniel Bowring <git@daniel.bowring.email>
Date: Sat, 12 Oct 2024 02:34:48 +1100
Subject: [PATCH 13/27] Add support for Python 3.13

---
 setup.py | 1 +
 tox.ini  | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 294572d..26894d4 100755
--- a/setup.py
+++ b/setup.py
@@ -71,6 +71,7 @@ def find_version(*file_paths):
         "Programming Language :: Python :: 3.10",
         "Programming Language :: Python :: 3.11",
         "Programming Language :: Python :: 3.12",
+        "Programming Language :: Python :: 3.13",
         "Programming Language :: Python :: Implementation :: PyPy",
     ],
 )
diff --git a/tox.ini b/tox.ini
index 3f03df8..926fda5 100644
--- a/tox.ini
+++ b/tox.ini
@@ -5,7 +5,7 @@
 
 [tox]
 envlist =
-    py{38,39,310,311,312,py3}, doc
+    py{38,39,310,311,312,313,py3}, doc
 skip_missing_interpreters =
     True
 

From 1986e25df8cda751d30599d676cece6942fabd38 Mon Sep 17 00:00:00 2001
From: botlabsDev <54632107+botlabsDev@users.noreply.github.com>
Date: Sun, 12 Jan 2025 16:31:37 +0100
Subject: [PATCH 14/27] Fix issue #89, introduce flag option to keep images in
 summary.

---
 readability/readability.py                    | 11 +++----
 requirements-dev.txt                          |  2 ++
 .../summary-keep-all-images.sample.html       | 29 +++++++++++++++++++
 tests/test_article_only.py                    | 21 ++++++++++++++
 4 files changed, 58 insertions(+), 5 deletions(-)
 create mode 100644 tests/samples/summary-keep-all-images.sample.html

diff --git a/readability/readability.py b/readability/readability.py
index c86e7d1..286841c 100755
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -210,12 +210,13 @@ def get_clean_html(self):
         """
         return clean_attributes(tounicode(self.html, method="html"))
 
-    def summary(self, html_partial=False):
+    def summary(self, html_partial=False, keep_all_images=False):
         """
         Given a HTML file, extracts the text of the article.
 
         :param html_partial: return only the div of the document, don't wrap
                              in html and body tags.
+        :param keep_all_images: Keep all images in summary.
 
         Warning: It mutates internal DOM representation of the HTML document,
         so it is better to call other API methods before this one.
@@ -257,7 +258,7 @@ def summary(self, html_partial=False):
                         article = self.html.find("body")
                         if article is None:
                             article = self.html
-                cleaned_article = self.sanitize(article, candidates)
+                cleaned_article = self.sanitize(article, candidates, keep_all_images)
 
                 article_length = len(cleaned_article or "")
                 retry_length = self.retry_length
@@ -502,7 +503,7 @@ def reverse_tags(self, node, *tag_names):
         for tag_name in tag_names:
             yield from reversed(node.findall(".//%s" % tag_name))
 
-    def sanitize(self, node, candidates):
+    def sanitize(self, node, candidates, keep_all_images=False):
         MIN_LEN = self.min_text_length
         for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
             if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33:
@@ -563,8 +564,8 @@ def sanitize(self, node, candidates):
                 to_remove = False
                 reason = ""
 
-                # if el.tag == 'div' and counts["img"] >= 1:
-                #    continue
+                if keep_all_images and el.tag == 'div' and counts["img"] >= 1:
+                    continue
                 if counts["p"] and counts["img"] > 1 + counts["p"] * 1.3:
                     reason = "too many images (%s)" % counts["img"]
                     to_remove = True
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 4731fa9..6160e33 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,4 +1,6 @@
 lxml
+lxml_html_clean
+pytest
 chardet
 nose
 pep8
diff --git a/tests/samples/summary-keep-all-images.sample.html b/tests/samples/summary-keep-all-images.sample.html
new file mode 100644
index 0000000..127683f
--- /dev/null
+++ b/tests/samples/summary-keep-all-images.sample.html
@@ -0,0 +1,29 @@
+<!DOCTYPE html>
+<html lang="en">
+<head></head>
+<body>
+<h2>
+    <span>
+        H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline
+    </span>
+</h2>
+<p>
+    <spa>
+        Text Text Text Text Text Text Text Text Text Text
+    </spa>
+</p>
+<div>
+    <span>
+        <a>
+            <img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAIAAAACUFjqAAABhGlDQ1BJQ0MgcHJvZmlsZQAAKJF9kT1Iw1AUhU9TpSLVDnYQcchQnSyIFXHUKhShQqgVWnUweekfNDEkKS6OgmvBwZ/FqoOLs64OroIg+APiLjgpukiJ9yWFFjFeeLyP8+45vHcfIDSqTLO6xgFNt81MKinm8iti6BUBhBFBPxIys4xZSUrDt77uqZvqLs6z/Pv+rD61YDEgIBLPMMO0ideJpzZtg/M+cZSVZZX4nHjMpAsSP3Jd8fiNc8llgWdGzWxmjjhKLJY6WOlgVjY14knimKrplC/kPFY5b3HWqjXWuid/YbigLy9xndYwUljAIiSIUFBDBVXYiNOuk2IhQ+dJH/+Q65fIpZCrAkaOeWxAg+z6wf/g92ytYmLCSwonge4Xx/kYAUK7QLPuON/HjtM8AYLPwJXe9m80gOlP0uttLXYERLaBi+u2puwBlzvA4JMhm7IrBWkJxSLwfkbflAcGboHeVW9urXOcPgBZmlX6Bjg4BEZLlL3m8+6ezrn929Oa3w9e03KfJqsuOAAAAAlwSFlzAAAuIwAALiMBeKU/dgAAAAd0SU1FB+kBDA8PKt1W5MYAAAAZdEVYdENvbW1lbnQAQ3JlYXRlZCB3aXRoIEdJTVBXgQ4XAAAAFUlEQVQY02P8x+rFgBswMeAFI1UaAJ65AWFYB2G5AAAAAElFTkSuQmCC"
+            />
+         </a>
+    </span>
+</div>
+<p>
+    <spa>
+        Text Text Text Text Text Text Text Text Text Text
+    </spa>
+</p>
+</body>
+</html>
\ No newline at end of file
diff --git a/tests/test_article_only.py b/tests/test_article_only.py
index c5592cf..1835d9f 100644
--- a/tests/test_article_only.py
+++ b/tests/test_article_only.py
@@ -133,3 +133,24 @@ def test_author_absent(self):
         sample = load_sample("si-game.sample.html")
         doc = Document(sample)
         assert '[no-author]' == doc.author()
+
+    def test_keep_images_present(self):
+        sample = load_sample("summary-keep-all-images.sample.html")
+
+        doc = Document(sample)
+
+        assert "<img" in doc.summary(keep_all_images=True)
+
+    def test_keep_images_absent(self):
+        sample = load_sample("summary-keep-all-images.sample.html")
+
+        doc = Document(sample)
+
+        assert "<img" not in doc.summary(keep_all_images=False)
+
+    def test_keep_images_absent_by_defautl(self):
+        sample = load_sample("summary-keep-all-images.sample.html")
+
+        doc = Document(sample)
+
+        assert "<img" not in doc.summary()

From 88cc983f444cdb2950dfc431b17923989314f22f Mon Sep 17 00:00:00 2001
From: Yuri Baburov <burchik@gmail.com>
Date: Mon, 13 Jan 2025 02:05:09 +0700
Subject: [PATCH 15/27] Create python-package.yml

---
 .github/workflows/python-package.yml | 40 ++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 .github/workflows/python-package.yml

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
new file mode 100644
index 0000000..73784a4
--- /dev/null
+++ b/.github/workflows/python-package.yml
@@ -0,0 +1,40 @@
+# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+
+name: Python package
+
+on:
+  push:
+    branches: [ "master" ]
+  pull_request:
+    branches: [ "master" ]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
+
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v3
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        python -m pip install flake8 pytest
+        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+    - name: Lint with flake8
+      run: |
+        # stop the build if there are Python syntax errors or undefined names
+        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+    - name: Test with pytest
+      run: |
+        pytest

From efebb4cdd90fdff58b1507534a2b2c72ff7680d6 Mon Sep 17 00:00:00 2001
From: Yuri Baburov <burchik@gmail.com>
Date: Mon, 13 Jan 2025 03:00:49 +0700
Subject: [PATCH 16/27] Removed wrapt decorator

---
 setup.py                   |  7 -------
 tests/test_article_only.py | 30 ++++++++++++++++++++++++++++--
 2 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/setup.py b/setup.py
index 26894d4..1819b41 100755
--- a/setup.py
+++ b/setup.py
@@ -9,14 +9,8 @@
      "cchardet",
 ]
 
-test_deps = [
-    # Test timeouts
-    "wrapt-timeout-decorator",
-]
-
 extras = {
     'speed': speed_deps,
-    'test': test_deps,
 }
 
 # Adapted from https://github.com/pypa/pip/blob/master/setup.py
@@ -54,7 +48,6 @@ def find_version(*file_paths):
         "lxml-html-clean; python_version < '3.11'",
         "cssselect"
     ],
-    tests_require=test_deps,
     extras_require=extras,
     classifiers=[
         "Environment :: Web Environment",
diff --git a/tests/test_article_only.py b/tests/test_article_only.py
index 1835d9f..d6cef52 100644
--- a/tests/test_article_only.py
+++ b/tests/test_article_only.py
@@ -1,8 +1,34 @@
 import os
+import time
 import unittest
 
 from readability import Document
-from wrapt_timeout_decorator import *
+from functools import wraps
+
+
+class TimeoutException(Exception):
+    """Exception raised when a function exceeds its time limit."""
+    pass
+
+
+def timeout(seconds):
+    """Decorator that raises TimeoutException after the call returns if it ran longer than `seconds`; it does not interrupt the call."""
+    def decorator(func):
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            start_time = time.perf_counter()
+            result = func(*args, **kwargs)
+            end_time = time.perf_counter()
+            elapsed_time = end_time - start_time
+            if elapsed_time > seconds:
+                raise TimeoutException(
+                    f"Function '{func.__name__}' exceeded time limit of {seconds} seconds "
+                    f"with an execution time of {elapsed_time:.4f} seconds"
+                )
+            return result
+        return wrapper
+    return decorator
+
 
 SAMPLES = os.path.join(os.path.dirname(__file__), "samples")
 
@@ -100,7 +126,7 @@ def test_correct_cleanup(self):
         assert not "aside" in s
 
     # Many spaces make some regexes run forever
-    @timeout(3, use_signals=False)
+    @timeout(3)
     def test_many_repeated_spaces(self):
         long_space = " " * 1000000
         sample = "<html><body><p>foo" + long_space + "</p></body></html>"

From 956bfbbe46597cdb678ab54e5b1dfada0e26da13 Mon Sep 17 00:00:00 2001
From: Yuri Baburov <burchik@gmail.com>
Date: Mon, 13 Jan 2025 03:02:49 +0700
Subject: [PATCH 17/27] Update python-package.yml

---
 .github/workflows/python-package.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 73784a4..23f1610 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -16,7 +16,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
+        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]
 
     steps:
     - uses: actions/checkout@v4

From b220919186d0db2a006bbabb910b602056b3f51a Mon Sep 17 00:00:00 2001
From: Yuri Baburov <burchik@gmail.com>
Date: Thu, 16 Jan 2025 02:48:45 +0700
Subject: [PATCH 18/27] Bump to 0.8.3

---
 readability/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/readability/__init__.py b/readability/__init__.py
index 6a263bf..18dccae 100644
--- a/readability/__init__.py
+++ b/readability/__init__.py
@@ -1,3 +1,3 @@
-__version__ = "0.8.2"
+__version__ = "0.8.3"
 
 from .readability import Document

From c1574456f5aefc1dc05d7def332c48e3799e214c Mon Sep 17 00:00:00 2001
From: cdhigh <cdhigh@users.noreply.github.com>
Date: Thu, 1 May 2025 10:37:30 -0300
Subject: [PATCH 19/27] shorten_title supports CJK character sets.

---
 readability/htmls.py | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/readability/htmls.py b/readability/htmls.py
index 87299f5..b090aa5 100644
--- a/readability/htmls.py
+++ b/readability/htmls.py
@@ -110,29 +110,35 @@ def shorten_title(doc):
             if e.text_content():
                 add_match(candidates, e.text_content(), orig)
 
+    cjk = re.compile('[\u4e00-\u9fff]+')
+
     if candidates:
         title = sorted(candidates, key=len)[-1]
     else:
         for delimiter in [" | ", " - ", " :: ", " / "]:
             if delimiter in title:
                 parts = orig.split(delimiter)
-                if len(parts[0].split()) >= 4:
-                    title = parts[0]
+                p0 = parts[0]
+                pl = parts[-1]
+                if (len(p0.split()) >= 4) or (len(p0) >= 4 and cjk.search(p0)):
+                    title = p0
                     break
-                elif len(parts[-1].split()) >= 4:
-                    title = parts[-1]
+                elif (len(p1.split()) >= 4) or (len(p1) >= 4 and cjk.search(p1)):
+                    title = p1
                     break
         else:
             if ": " in title:
-                parts = orig.split(": ")
-                if len(parts[-1].split()) >= 4:
-                    title = parts[-1]
+                p1 = orig.split(": ")[-1]
+                if (len(p1.split()) >= 4) or (len(p1) >= 4 and cjk.search(p1)):
+                    title = p1
                 else:
                     title = orig.split(": ", 1)[1]
 
-    if not 15 < len(title) < 150:
+    if cjk.search(title) and not (4 <= len(title) < 100):
         return orig
-
+    elif not 15 < len(title) < 150:
+        return orig
+    
     return title
 
 

From 16ce81dd89bf25b179dced79070fb933857e5dc6 Mon Sep 17 00:00:00 2001
From: cdhigh <cdhigh@users.noreply.github.com>
Date: Thu, 1 May 2025 10:47:50 -0300
Subject: [PATCH 20/27] Update cleaners.py

---
 readability/cleaners.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/readability/cleaners.py b/readability/cleaners.py
index 69825c6..e0b0726 100644
--- a/readability/cleaners.py
+++ b/readability/cleaners.py
@@ -1,6 +1,9 @@
 # strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
 import re
-from lxml.html.clean import Cleaner
+try:
+    from lxml.html.clean import Cleaner
+except ImportError:
+    from lxml_html_clean import Cleaner
 
 bad_attrs = ["width", "height", "style", "[-a-z]*color", "background[-a-z]*", "on*"]
 single_quoted = "'[^']+'"

From f02d865bc4afc435cc02224a6915494a33abe629 Mon Sep 17 00:00:00 2001
From: Yuri Baburov <burchik@gmail.com>
Date: Sat, 3 May 2025 18:39:26 +0700
Subject: [PATCH 21/27] Reduced requirements-dev to just nose so "make test"
 will work again.

---
 requirements-dev.txt | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/requirements-dev.txt b/requirements-dev.txt
index 6160e33..9f580cb 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,8 +1 @@
-lxml
-lxml_html_clean
-pytest
-chardet
-nose
-pep8
-coverage
-wrapt-timeout-decorator
+nose
\ No newline at end of file

From 6f1b449962fe577e8d695d3147a9fbc21b9bd333 Mon Sep 17 00:00:00 2001
From: Yuri Baburov <burchik@gmail.com>
Date: Sat, 3 May 2025 18:49:08 +0700
Subject: [PATCH 22/27] Better CJK support (and fix for lxml-clean), thanks
 @cdhigh

---
 .gitignore              |  3 ++-
 Makefile                |  7 ++++---
 README.rst              |  4 +++-
 pyproject.toml          | 19 +++++++++++++++++++
 readability/__init__.py |  2 +-
 5 files changed, 29 insertions(+), 6 deletions(-)
 create mode 100644 pyproject.toml

diff --git a/.gitignore b/.gitignore
index d896106..b532e65 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,4 +14,5 @@ nosetests.xml
 .idea
 .cache
 /.noseids
-/.venv
\ No newline at end of file
+/.venv
+/poetry.lock
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 012e4b7..f1c8f21 100644
--- a/Makefile
+++ b/Makefile
@@ -50,6 +50,7 @@ clean_all: clean_venv
 # ###########
 .PHONY: dist
 dist:
+	$(PY) -m pip install wheel
 	$(PY) setup.py sdist bdist_wheel
 	$(TWINE) check dist/*
 
@@ -57,6 +58,6 @@ dist:
 upload:
 	$(TWINE) upload dist/*
 
-.PHONY: version_update
-version_update:
-	$(EDITOR) setup.py
+.PHONY: bump
+bump:
+	$(EDITOR) readability/__init__.py
diff --git a/README.rst b/README.rst
index 9b0a8b7..72b4e63 100644
--- a/README.rst
+++ b/README.rst
@@ -48,7 +48,9 @@ Usage
 
 Change Log
 ----------
-
+-  0.8.4 Better CJK support, thanks @cdhigh
+-  0.8.3.1 Support for python 3.8 - 3.13
+-  0.8.3 We can now save all images via keep_all_images=True (default is to save 1 main image), thanks @botlabsDev
 -  0.8.2 Added article author(s) (thanks @mattblaha)
 -  0.8.1 Fixed processing of non-ascii HTMLs via regexps.
 -  0.8 Replaced XHTML output with HTML5 output in summary() call.
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..4dad46a
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,19 @@
+[tool.poetry]
+name = "readability-lxml"
+version = "0.8.4"
+description = "fast html to text parser (article readability tool) with python 3 support"
+authors = ["Yuri Baburov <burchik@gmail.com>"]
+license = "Apache License 2.0"
+readme = "README.rst"
+
+[tool.poetry.dependencies]
+python = ">=3.8.2,<3.14"
+chardet = "^5.2.0"
+cssselect = "~1.2"
+lxml = {extras = ["html-clean"], version = "^5.4.0"}
+lxml-html-clean = {markers = "python_version < \"3.11\"", version = "^0.4.2"}
+
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
diff --git a/readability/__init__.py b/readability/__init__.py
index 18dccae..f27111b 100644
--- a/readability/__init__.py
+++ b/readability/__init__.py
@@ -1,3 +1,3 @@
-__version__ = "0.8.3"
+__version__ = "0.8.4"
 
 from .readability import Document

From 344ba9e7c4839019af1d6aace030a9425eeb06cf Mon Sep 17 00:00:00 2001
From: Yuri Baburov <burchik@gmail.com>
Date: Sat, 3 May 2025 18:49:08 +0700
Subject: [PATCH 23/27] Better CJK support (and fix for lxml-clean), thanks
 @cdhigh

---
 requirements.txt | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index d6e1198..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
--e .

From be72501fec6d4924ca97cdfccfa03eaad57cc249 Mon Sep 17 00:00:00 2001
From: Yuri Baburov <burchik@gmail.com>
Date: Sat, 3 May 2025 19:11:41 +0700
Subject: [PATCH 24/27] Updates for publishing.

---
 Makefile             |  2 +-
 README.md            | 67 +++++++++++++++++++++++++++++++++++++
 README.rst           | 78 --------------------------------------------
 requirements-dev.txt |  3 +-
 setup.py             |  4 +--
 5 files changed, 72 insertions(+), 82 deletions(-)
 create mode 100644 README.md
 delete mode 100644 README.rst

diff --git a/Makefile b/Makefile
index f1c8f21..ba14e4f 100644
--- a/Makefile
+++ b/Makefile
@@ -4,7 +4,7 @@ PY := .venv/bin/python
 PIP := .venv/bin/pip
 PEP8 := .venv/bin/pep8
 NOSE := .venv/bin/nosetests
-TWINE := twine
+TWINE := .venv/bin/twine
 
 # ###########
 # Tests rule!
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e09a515
--- /dev/null
+++ b/README.md
@@ -0,0 +1,67 @@
+[![PyPI version](https://img.shields.io/pypi/v/readability-lxml.svg)](https://pypi.python.org/pypi/readability-lxml)
+
+# python-readability
+
+Given an HTML document, extract and clean up the main body text and title.
+
+This is a Python port of a Ruby port of [arc90's Readability project](https://web.archive.org/web/20130519040221/http://www.readability.com/).
+
+## Installation
+
+Installation is easy with `pip`; just run:
+
+```bash
+$ pip install readability-lxml
+```
+
+Alternatively, you can install it with conda:
+
+```bash
+$ conda install -c conda-forge readability-lxml
+```
+
+## Usage
+
+```python
+>>> import requests
+>>> from readability import Document
+
+>>> response = requests.get('http://example.com')
+>>> doc = Document(response.content)
+>>> doc.title()
+'Example Domain'
+
+>>> doc.summary()
+"""<html><body><div><body id="readabilityBody">\n<div>\n    <h1>Example Domain</h1>\n
+<p>This domain is established to be used for illustrative examples in documents. You may
+use this\n    domain in examples without prior coordination or asking for permission.</p>
+\n    <p><a href="http://www.iana.org/domains/example">More information...</a></p>\n</div>
+\n</body>\n</div></body></html>"""
+```
+
+## Change Log
+- 0.8.4 Better CJK support, thanks @cdhigh
+- 0.8.3.1 Support for python 3.8 - 3.13
+- 0.8.3 All images can now be kept via `keep_all_images=True` (by default only 1 main image is kept), thanks @botlabsDev
+- 0.8.2 Added article author(s) (thanks @mattblaha)
+- 0.8.1 Fixed processing of non-ascii HTMLs via regexps.
+- 0.8 Replaced XHTML output with HTML5 output in summary() call.
+- 0.7.1 Support for Python 3.7. Fixed a slowdown when processing documents with lots of spaces.
+- 0.7 Improved HTML5 tags handling. Fixed stripping unwanted HTML nodes (only first matching node was removed before).
+- 0.6 Finally a release which supports Python versions 2.6, 2.7, 3.3 - 3.6
+- 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and 3.4
+- 0.4 Added Videos loading and allowed more images per paragraph
+- 0.3 Added `Document.encoding`, `positive_keywords` and `negative_keywords`
+
+## Licensing
+
+This code is licensed under [the Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0).
+
+## Thanks to
+
+- Latest [readability.js](https://github.com/MHordecki/readability-redux/blob/master/readability/readability.js)
+- Ruby port by starrhorne and iterationlabs
+- [Python port](https://github.com/gfxmonk/python-readability) by gfxmonk
+- [Decruft effort](https://web.archive.org/web/20110214150709/https://www.minvolai.com/blog/decruft-arc90s-readability-in-python/) to move to lxml
+- "BR to P" fix from readability.js which improves quality for smaller texts
+- Contributions from GitHub users.
diff --git a/README.rst b/README.rst
deleted file mode 100644
index 72b4e63..0000000
--- a/README.rst
+++ /dev/null
@@ -1,78 +0,0 @@
-.. image:: https://travis-ci.org/buriy/python-readability.svg?branch=master
-    :target: https://travis-ci.org/buriy/python-readability
-.. image:: https://img.shields.io/pypi/v/readability-lxml.svg
-    :target: https://pypi.python.org/pypi/readability-lxml
-
-python-readability
-==================
-
-Given an HTML document, extract and clean up the main body text and title.
-
-This is a Python port of a Ruby port of `arc90's Readability
-project <https://web.archive.org/web/20130519040221/http://www.readability.com/>`__.
-
-Installation
-------------
-
-It's easy using ``pip``, just run:
-
-.. code-block:: bash
-
-    $ pip install readability-lxml
-
-As an alternative, you may also use conda to install, just run:
-
-.. code-block:: bash
-
-    $ conda install -c conda-forge readability-lxml 
-
-Usage
------
-
-.. code-block:: python
-
-    >>> import requests
-    >>> from readability import Document
-
-    >>> response = requests.get('http://example.com')
-    >>> doc = Document(response.content)
-    >>> doc.title()
-    'Example Domain'
-
-    >>> doc.summary()
-    """<html><body><div><body id="readabilityBody">\n<div>\n    <h1>Example Domain</h1>\n
-    <p>This domain is established to be used for illustrative examples in documents. You may
-    use this\n    domain in examples without prior coordination or asking for permission.</p>
-    \n    <p><a href="http://www.iana.org/domains/example">More information...</a></p>\n</div>
-    \n</body>\n</div></body></html>"""
-
-Change Log
-----------
--  0.8.4 Better CJK support, thanks @cdhigh
--  0.8.3.1 Support for python 3.8 - 3.13
--  0.8.3 We can now save all images via keep_all_images=True (default is to save 1 main image), thanks @botlabsDev
--  0.8.2 Added article author(s) (thanks @mattblaha)
--  0.8.1 Fixed processing of non-ascii HTMLs via regexps.
--  0.8 Replaced XHTML output with HTML5 output in summary() call.
--  0.7.1 Support for Python 3.7 . Fixed a slowdown when processing documents with lots of spaces.
--  0.7 Improved HTML5 tags handling. Fixed stripping unwanted HTML nodes (only first matching node was removed before).
--  0.6 Finally a release which supports Python versions 2.6, 2.7, 3.3 - 3.6
--  0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and 3.4
--  0.4 Added Videos loading and allowed more images per paragraph
--  0.3 Added Document.encoding, positive\_keywords and negative\_keywords
-
-Licensing
----------
-
-This code is under `the Apache License
-2.0 <http://www.apache.org/licenses/LICENSE-2.0>`__ license.
-
-Thanks to
----------
-
--  Latest `readability.js <https://github.com/MHordecki/readability-redux/blob/master/readability/readability.js>`__
--  Ruby port by starrhorne and iterationlabs
--  `Python port <https://github.com/gfxmonk/python-readability>`__ by gfxmonk
--  `Decruft effort <https://web.archive.org/web/20110214150709/https://www.minvolai.com/blog/decruft-arc90s-readability-in-python/>` to move to lxml
--  "BR to P" fix from readability.js which improves quality for smaller texts
--  Github users contributions.
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 9f580cb..bc876e5 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1 +1,2 @@
-nose
\ No newline at end of file
+nose
+twine
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 1819b41..a88e818 100755
--- a/setup.py
+++ b/setup.py
@@ -37,8 +37,8 @@ def find_version(*file_paths):
     author_email="burchik@gmail.com",
     description="fast html to text parser (article readability tool) with python 3 support",
     test_suite="tests.test_article_only",
-    long_description=open("README.rst").read(),
-    long_description_content_type='text/x-rst',
+    long_description=open("README.md").read(),
+    long_description_content_type="text/markdown",
     license="Apache License 2.0",
     url="http://github.com/buriy/python-readability",
     packages=["readability"],

From 11c721d920c674a145e142e2d1a5bc11ea6278f9 Mon Sep 17 00:00:00 2001
From: Yuri Baburov <burchik@gmail.com>
Date: Sun, 4 May 2025 03:57:01 +0700
Subject: [PATCH 25/27] Fix CJK title fix, added a test

---
 readability/encoding.py    |  3 +--
 readability/htmls.py       | 11 ++++----
 readability/readability.py | 10 ++++----
 tests/test_article_only.py | 51 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 63 insertions(+), 12 deletions(-)

diff --git a/readability/encoding.py b/readability/encoding.py
index c95cc14..08332df 100644
--- a/readability/encoding.py
+++ b/readability/encoding.py
@@ -1,9 +1,8 @@
 import re
 try:
-    import cchardet
+    import cchardet as chardet
 except ImportError:
     import chardet
-import sys
 
 
 RE_CHARSET = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
diff --git a/readability/htmls.py b/readability/htmls.py
index b090aa5..d99a9f5 100644
--- a/readability/htmls.py
+++ b/readability/htmls.py
@@ -123,8 +123,8 @@ def shorten_title(doc):
                 if (len(p0.split()) >= 4) or (len(p0) >= 4 and cjk.search(p0)):
                     title = p0
                     break
-                elif (len(p1.split()) >= 4) or (len(p1) >= 4 and cjk.search(p1)):
-                    title = p1
+                elif (len(pl.split()) >= 4) or (len(pl) >= 4 and cjk.search(pl)):
+                    title = pl
                     break
         else:
             if ": " in title:
@@ -134,11 +134,12 @@ def shorten_title(doc):
                 else:
                     title = orig.split(": ", 1)[1]
 
-    if cjk.search(title) and not (4 <= len(title) < 100):
-        return orig
+    if cjk.search(title):
+        if not (4 <= len(title) < 100):  # Allow length >= 4, cap at 100
+            return orig
     elif not 15 < len(title) < 150:
         return orig
-    
+
     return title
 
 
diff --git a/readability/readability.py b/readability/readability.py
index 286841c..c573905 100755
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -42,11 +42,11 @@
     "divToPElementsRe": re.compile(
         r"<(a|blockquote|dl|div|img|ol|p|pre|table|ul)", re.I
     ),
-    #'replaceBrsRe': re.compile(r'(<br[^>]*>[ \n\r\t]*){2,}',re.I),
-    #'replaceFontsRe': re.compile(r'<(\/?)font[^>]*>',re.I),
-    #'trimRe': re.compile(r'^\s+|\s+$/'),
-    #'normalizeRe': re.compile(r'\s{2,}/'),
-    #'killBreaksRe': re.compile(r'(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
+    # 'replaceBrsRe': re.compile(r'(<br[^>]*>[ \n\r\t]*){2,}',re.I),
+    # 'replaceFontsRe': re.compile(r'<(\/?)font[^>]*>',re.I),
+    # 'trimRe': re.compile(r'^\s+|\s+$/'),
+    # 'normalizeRe': re.compile(r'\s{2,}/'),
+    # 'killBreaksRe': re.compile(r'(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
     "videoRe": re.compile(r"https?:\/\/(www\.)?(youtube|vimeo)\.com", re.I),
     # skipFootnoteLink:      /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
 }
diff --git a/tests/test_article_only.py b/tests/test_article_only.py
index d6cef52..fe32212 100644
--- a/tests/test_article_only.py
+++ b/tests/test_article_only.py
@@ -149,6 +149,7 @@ def test_utf8_kanji(self):
         sample = load_sample("utf-8-kanji.sample.html")
         doc = Document(sample)
         res = doc.summary()
+        assert 0 < len(res) < 10000
 
     def test_author_present(self):
         sample = load_sample("the-hurricane-rubin-carter-denzel-washington.html")
@@ -180,3 +181,53 @@ def test_keep_images_absent_by_defautl(self):
         doc = Document(sample)
 
         assert "<img" not in doc.summary()
+
+    def test_cjk_summary(self):
+        """Check we can extract CJK text correctly."""
+        html = """
+        <html>
+            <head>
+                <title>这是标题</title>
+            </head>
+            <body>
+                <div>一些无关紧要的内容</div>
+                <div class="article-content">
+                    <h1>主要文章标题</h1>
+                    <p>这是主要内容的第一段。</p>
+                    <p>これはコンテンツの第2段落です。</p>
+                    <p>이것은 콘텐츠의 세 번째 단락입니다.</p>
+                    <p>This is the fourth paragraph.</p>
+                </div>
+                <div>More irrelevant stuff</div>
+            </body>
+        </html>
+        """
+        doc = Document(html)
+        summary = doc.summary()
+        # Check that the main CJK content is present in the summary
+        self.assertTrue("这是主要内容的第一段" in summary)
+        self.assertTrue("これはコンテンツの第2段落です" in summary)
+        self.assertTrue("이것은 콘텐츠의 세 번째 단락입니다" in summary)
+        # Check that irrelevant content is mostly gone
+        self.assertFalse("一些无关紧要的内容" in summary)
+
+    def test_shorten_title_delimiter_bug(self):
+        """Test that shorten_title handles delimiters correctly when the last part is valid.
+
+        This specifically targets a potential bug where 'p1' might be used instead of 'pl'.
+        """
+        html = """
+        <html>
+            <head>
+                <title>Short Part | これは長いです</title>
+            </head>
+            <body>
+                <div>Content</div>
+            </body>
+        </html>
+        """
+        doc = Document(html)
+        # With the bug, this call might raise NameError: name 'p1' is not defined
+        # With the fix, it should correctly return the last part.
+        short_title = doc.short_title()
+        self.assertEqual(short_title, "これは長いです")

From 72318f15af98e24a67c48b5abb646f6f2e3f109a Mon Sep 17 00:00:00 2001
From: Yuri Baburov <burchik@gmail.com>
Date: Sun, 4 May 2025 03:57:26 +0700
Subject: [PATCH 26/27] Fix poetry builds

---
 Makefile             |  6 +++++-
 pyproject.toml       | 10 ++++++++--
 requirements-dev.txt |  3 ++-
 3 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index ba14e4f..52502f3 100644
--- a/Makefile
+++ b/Makefile
@@ -24,7 +24,7 @@ all: setup develop
 venv: .venv/bin/python
 
 setup: venv
-	$(PIP) install -r requirements-dev.txt
+	$(PIP) install -r requirements-dev.txt | grep -v "already satisfied" || true
 
 .venv/bin/python:
 	test -d .venv || which python3 && python3 -m venv .venv || virtualenv .venv
@@ -45,6 +45,10 @@ develop: .venv/lib/python*/site-packages/readability-lxml.egg-link
 .PHONY: clean_all
 clean_all: clean_venv
 
+.PHONY: build
+build:
+	poetry build
+
 # ###########
 # Deploy
 # ###########
diff --git a/pyproject.toml b/pyproject.toml
index 4dad46a..cff93b1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,12 +4,18 @@ version = "0.8.4"
 description = "fast html to text parser (article readability tool) with python 3 support"
 authors = ["Yuri Baburov <burchik@gmail.com>"]
 license = "Apache License 2.0"
-readme = "README.rst"
+readme = "README.md"
+packages = [
+    { include = "readability" },
+]
 
 [tool.poetry.dependencies]
 python = ">=3.8.2,<3.14"
 chardet = "^5.2.0"
-cssselect = "~1.2"
+cssselect = [
+    { version = "~1.2", markers = "python_version < '3.9'" },
+    { version = "~1.3", markers = "python_version >= '3.9'" }
+]
 lxml = {extras = ["html-clean"], version = "^5.4.0"}
 lxml-html-clean = {markers = "python_version < \"3.11\"", version = "^0.4.2"}
 
diff --git a/requirements-dev.txt b/requirements-dev.txt
index bc876e5..996bbfc 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,2 +1,3 @@
 nose
-twine
\ No newline at end of file
+twine
+flake8
\ No newline at end of file

From c8d8011f3d4c69d7667a52395237e56e66af8ea4 Mon Sep 17 00:00:00 2001
From: Yuri Baburov <burchik@gmail.com>
Date: Sun, 4 May 2025 04:09:54 +0700
Subject: [PATCH 27/27] Bump version to 0.8.4.1

---
 .flake8                 | 2 ++
 Makefile                | 6 ++++++
 pyproject.toml          | 2 +-
 readability/__init__.py | 2 +-
 4 files changed, 10 insertions(+), 2 deletions(-)
 create mode 100644 .flake8

diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000..b33811f
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,2 @@
+[flake8]
+ignore = E501, W503 
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 52502f3..9caf08a 100644
--- a/Makefile
+++ b/Makefile
@@ -65,3 +65,9 @@ upload:
 .PHONY: bump
 bump:
 	$(EDITOR) readability/__init__.py
+	$(eval VERSION := $(shell grep "__version__" readability/__init__.py | cut -d'"' -f2))
+	# fix first occurrence of version in pyproject.toml
+	sed -i '0,/version = ".*"/s//version = "$(VERSION)"/' pyproject.toml
+	git commit -m "Bump version to $(VERSION)" pyproject.toml readability/__init__.py
+	git tag $(VERSION)
+	git push --tags
diff --git a/pyproject.toml b/pyproject.toml
index cff93b1..4499285 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "readability-lxml"
-version = "0.8.4"
+version = "0.8.4.1"
 description = "fast html to text parser (article readability tool) with python 3 support"
 authors = ["Yuri Baburov <burchik@gmail.com>"]
 license = "Apache License 2.0"
diff --git a/readability/__init__.py b/readability/__init__.py
index f27111b..b36f021 100644
--- a/readability/__init__.py
+++ b/readability/__init__.py
@@ -1,3 +1,3 @@
-__version__ = "0.8.4"
+__version__ = "0.8.4.1"
 
 from .readability import Document