From e13814d9678f7821d49619d89532a680b271e439 Mon Sep 17 00:00:00 2001 From: Daniel Bowring Date: Thu, 15 Aug 2024 21:37:40 +1000 Subject: [PATCH 01/27] Drop support for EOL OSX <10.9 --- setup.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/setup.py b/setup.py index 2770abe..9c7dd48 100755 --- a/setup.py +++ b/setup.py @@ -7,16 +7,6 @@ from setuptools import setup import sys -lxml_requirement = "lxml" -if sys.platform == "darwin": - import platform - - mac_ver = platform.mac_ver()[0] - mac_major, mac_minor = mac_ver.split('.')[:2] - if int(mac_major) == 10 and int(mac_minor) < 9: - print("Using lxml<2.4") - lxml_requirement = "lxml<2.4" - speed_deps = [ "cchardet", ] @@ -60,7 +50,7 @@ def find_version(*file_paths): license="Apache License 2.0", url="http://github.com/buriy/python-readability", packages=["readability", "readability.compat"], - install_requires=["chardet", lxml_requirement, "cssselect"], + install_requires=["chardet", "lxml", "cssselect"], tests_require=test_deps, extras_require=extras, classifiers=[ From 4ebadbdace5153ab1e3a2a60dd010f6e2626cd22 Mon Sep 17 00:00:00 2001 From: Daniel Bowring Date: Thu, 15 Aug 2024 21:39:44 +1000 Subject: [PATCH 02/27] Add missing classifiers for supported python versions --- setup.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 9c7dd48..8f6e39a 100755 --- a/setup.py +++ b/setup.py @@ -5,7 +5,6 @@ import os import re from setuptools import setup -import sys speed_deps = [ "cchardet", @@ -69,5 +68,8 @@ def find_version(*file_paths): "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: Implementation :: PyPy", ], ) From 7fae6223e12709d5bf22cf929e73a22b5565ef44 Mon Sep 17 00:00:00 2001 From: Daniel Bowring Date: Thu, 15 Aug 2024 21:43:26 +1000 Subject: [PATCH 03/27] Fix support for lxml>=5.2.0 --- setup.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8f6e39a..cca17d4 100755 --- a/setup.py +++ b/setup.py @@ -49,7 +49,12 @@ def find_version(*file_paths): license="Apache License 2.0", url="http://github.com/buriy/python-readability", packages=["readability", "readability.compat"], - install_requires=["chardet", "lxml", "cssselect"], + install_requires=[ + "chardet", + "lxml[html_clean]", + "lxml-html-clean; python_version < '3.11'", + "cssselect" + ], tests_require=test_deps, extras_require=extras, classifiers=[ From 2c90062c4c96baba7d8020ae921c63f2a41fa3d3 Mon Sep 17 00:00:00 2001 From: Daniel Bowring Date: Thu, 15 Aug 2024 21:51:50 +1000 Subject: [PATCH 04/27] Drop support for EOL Python 2.7 --- .travis.yml | 7 ---- doc/source/conf.py | 1 - readability/compat/__init__.py | 26 -------------- readability/compat/three.py | 6 ---- readability/compat/two.py | 6 ---- readability/encoding.py | 9 +++-- readability/htmls.py | 19 +++++----- readability/readability.py | 63 ++++++++++++---------------------- setup.py | 5 +-- tox.ini | 2 +- 10 files changed, 36 insertions(+), 108 deletions(-) delete mode 100644 readability/compat/__init__.py delete mode 100644 readability/compat/three.py delete mode 100644 readability/compat/two.py diff --git a/.travis.yml b/.travis.yml index 21e1ce1..cab9e23 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,9 +4,6 @@ cache: pip matrix: include: - - name: "Python 2.7 on Linux" - python: 2.7 - env: PIP=pip - name: "Python 3.5 on Linux" python: 3.5 - name: "Python 3.6 on Linux" @@ -19,9 +16,6 @@ matrix: - name: "Python 3.9 Nightly on Linux" dist: bionic python: nightly - - name: "Pypy on Linux" - python: pypy - env: PIP=pip - name: "Pypy 3 on Linux" python: pypy3 - name: "Python 3.7 on older macOS" @@ -44,7 +38,6 @@ matrix: - pip3 --version allow_failures: - python: nightly - - python: pypy - python: pypy3 - os: osx diff --git a/doc/source/conf.py b/doc/source/conf.py index bb26134..a099772 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # # readability documentation build configuration file, created by # sphinx-quickstart on Thu Mar 23 16:29:38 2017. diff --git a/readability/compat/__init__.py b/readability/compat/__init__.py deleted file mode 100644 index caf0ea8..0000000 --- a/readability/compat/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -""" -This module contains compatibility helpers for Python 2/3 interoperability. - -It mainly exists because their are certain incompatibilities in the Python -syntax that can only be solved by conditionally importing different functions. -""" -import sys -from lxml.etree import tostring - -if sys.version_info[0] == 2: - bytes_ = str - str_ = unicode - def tostring_(s): - return tostring(s, encoding='utf-8').decode('utf-8') - -elif sys.version_info[0] == 3: - bytes_ = bytes - str_ = str - def tostring_(s): - return tostring(s, encoding='utf-8') - - -try: - from re import Pattern as pattern_type -except ImportError: - from re import _pattern_type as pattern_type diff --git a/readability/compat/three.py b/readability/compat/three.py deleted file mode 100644 index 2635157..0000000 --- a/readability/compat/three.py +++ /dev/null @@ -1,6 +0,0 @@ -def raise_with_traceback(exc_type, traceback, *args, **kwargs): - """ - Raise a new exception of type `exc_type` with an existing `traceback`. All - additional (keyword-)arguments are forwarded to `exc_type` - """ - raise exc_type(*args, **kwargs).with_traceback(traceback) diff --git a/readability/compat/two.py b/readability/compat/two.py deleted file mode 100644 index 642ecb7..0000000 --- a/readability/compat/two.py +++ /dev/null @@ -1,6 +0,0 @@ -def raise_with_traceback(exc_type, traceback, *args, **kwargs): - """ - Raise a new exception of type `exc_type` with an existing `traceback`. All - additional (keyword-)arguments are forwarded to `exc_type` - """ - raise exc_type(*args, **kwargs), None, traceback diff --git a/readability/encoding.py b/readability/encoding.py index 212ff92..c95cc14 100644 --- a/readability/encoding.py +++ b/readability/encoding.py @@ -39,11 +39,10 @@ def get_encoding(page): for declared_encoding in declared_encodings: try: # Python3 only - if sys.version_info[0] == 3: - # declared_encoding will actually be bytes but .decode() only - # accepts `str` type. Decode blindly with ascii because no one should - # ever use non-ascii characters in the name of an encoding. - declared_encoding = declared_encoding.decode("ascii", "replace") + # declared_encoding will actually be bytes but .decode() only + # accepts `str` type. Decode blindly with ascii because no one should + # ever use non-ascii characters in the name of an encoding. + declared_encoding = declared_encoding.decode("ascii", "replace") encoding = fix_charset(declared_encoding) # Now let's decode the page diff --git a/readability/htmls.py b/readability/htmls.py index acacb5a..87299f5 100644 --- a/readability/htmls.py +++ b/readability/htmls.py @@ -4,13 +4,12 @@ from .cleaners import normalize_spaces, clean_attributes from .encoding import get_encoding -from .compat import str_ utf8_parser = lxml.html.HTMLParser(encoding="utf-8") def build_doc(page): - if isinstance(page, str_): + if isinstance(page, str): encoding = None decoded_page = page else: @@ -30,14 +29,14 @@ def js_re(src, pattern, flags, repl): def normalize_entities(cur_title): entities = { - u"\u2014": "-", - u"\u2013": "-", - u"—": "-", - u"–": "-", - u"\u00A0": " ", - u"\u00AB": '"', - u"\u00BB": '"', - u""": '"', + "\u2014": "-", + "\u2013": "-", + "—": "-", + "–": "-", + "\u00A0": " ", + "\u00AB": '"', + "\u00BB": '"', + """: '"', } for c, r in entities.items(): if c in cur_title: diff --git a/readability/readability.py b/readability/readability.py index f16b170..5fc8b32 100755 --- a/readability/readability.py +++ b/readability/readability.py @@ -1,9 +1,12 @@ #!/usr/bin/env python -from __future__ import print_function import logging import re import sys +import urllib.request +import urllib.parse +import urllib.error +from lxml.etree import tostring from lxml.etree import tounicode from lxml.etree import _ElementTree from lxml.html import document_fromstring @@ -17,7 +20,6 @@ from .htmls import get_title from .htmls import get_author from .htmls import shorten_title -from .compat import str_, bytes_, tostring_, pattern_type from .debug import describe, text_content @@ -80,14 +82,14 @@ def text_length(i): def compile_pattern(elements): if not elements: return None - elif isinstance(elements, pattern_type): + elif isinstance(elements, re.Pattern): return elements - elif isinstance(elements, (str_, bytes_)): - if isinstance(elements, bytes_): - elements = str_(elements, "utf-8") - elements = elements.split(u",") + elif isinstance(elements, (str, bytes)): + if isinstance(elements, bytes): + elements = str(elements, "utf-8") + elements = elements.split(",") if isinstance(elements, (list, tuple)): - return re.compile(u"|".join([re.escape(x.strip()) for x in elements]), re.U) + return re.compile("|".join([re.escape(x.strip()) for x in elements]), re.U) else: raise Exception("Unknown type for the pattern: {}".format(type(elements))) # assume string or string like object @@ -242,19 +244,15 @@ def summary(self, html_partial=False): log.info("ruthless removal did not work. ") ruthless = False log.debug( - ( "ended up stripping too much - " "going for a safer _parse" - ) ) # try again continue else: log.debug( - ( "Ruthless and lenient parsing did not work. " "Returning raw html" - ) ) article = self.html.find("body") if article is None: @@ -272,11 +270,7 @@ def summary(self, html_partial=False): return cleaned_article except Exception as e: log.exception("error getting summary: ") - if sys.version_info[0] == 2: - from .compat.two import raise_with_traceback - else: - from .compat.three import raise_with_traceback - raise_with_traceback(Unparseable, sys.exc_info()[2], str_(e)) + raise Unparseable(str(e)).with_traceback(sys.exc_info()[2]) def get_article(self, candidates, best_candidate, html_partial=False): # Now that we have the top candidate, look through its siblings for @@ -474,7 +468,8 @@ def transform_misused_divs_into_paragraphs(self): # This results in incorrect results in case there is an # buried within an for example if not REGEXES["divToPElementsRe"].search( - str_(b"".join(map(tostring_, list(elem)))) + str(b"".join(tostring(s, encoding='utf-8') for s in elem)) + # str(b"".join(map(tostring_, list(elem)))) ): # log.debug("Altering %s to p" % (describe(elem))) elem.tag = "p" @@ -501,13 +496,11 @@ def transform_misused_divs_into_paragraphs(self): def tags(self, node, *tag_names): for tag_name in tag_names: - for e in node.findall(".//%s" % tag_name): - yield e + yield from node.findall(".//%s" % tag_name) def reverse_tags(self, node, *tag_names): for tag_name in tag_names: - for e in reversed(node.findall(".//%s" % tag_name)): - yield e + yield from reversed(node.findall(".//%s" % tag_name)) def sanitize(self, node, candidates): MIN_LEN = self.min_text_length @@ -594,13 +587,13 @@ def sanitize(self, node, candidates): ) to_remove = True elif weight < 25 and link_density > 0.2: - reason = "too many links %.3f for its weight %s" % ( + reason = "too many links {:.3f} for its weight {}".format( link_density, weight, ) to_remove = True elif weight >= 25 and link_density > 0.5: - reason = "too many links %.3f for its weight %s" % ( + reason = "too many links {:.3f} for its weight {}".format( link_density, weight, ) @@ -726,18 +719,10 @@ def main(): file = None if options.url: headers = {"User-Agent": "Mozilla/5.0"} - if sys.version_info[0] == 3: - import urllib.request, urllib.parse, urllib.error - - request = urllib.request.Request(options.url, None, headers) - file = urllib.request.urlopen(request) - else: - import urllib2 - - request = urllib2.Request(options.url, None, headers) - file = urllib2.urlopen(request) + request = urllib.request.Request(options.url, None, headers) + file = urllib.request.urlopen(request) else: - file = open(args[0], "rt") + file = open(args[0]) try: doc = Document( file.read(), @@ -751,14 +736,8 @@ def main(): result = "

" + doc.short_title() + "


" + doc.summary() open_in_browser(result) else: - enc = ( - sys.__stdout__.encoding or "utf-8" - ) # XXX: this hack could not always work, better to set PYTHONIOENCODING result = "Title:" + doc.short_title() + "\n" + doc.summary() - if sys.version_info[0] == 3: - print(result) - else: - print(result.encode(enc, "replace")) + print(result) finally: file.close() diff --git a/setup.py b/setup.py index cca17d4..032c057 100755 --- a/setup.py +++ b/setup.py @@ -1,6 +1,5 @@ #!/usr/bin/env python -from __future__ import print_function import codecs import os import re @@ -48,7 +47,7 @@ def find_version(*file_paths): long_description_content_type='text/x-rst', license="Apache License 2.0", url="http://github.com/buriy/python-readability", - packages=["readability", "readability.compat"], + packages=["readability"], install_requires=[ "chardet", "lxml[html_clean]", @@ -66,8 +65,6 @@ def find_version(*file_paths): "Topic :: Internet", "Topic :: Software Development :: Libraries :: Python Modules", "Programming Language :: Python", - "Programming Language :: Python :: 2", - "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", diff --git a/tox.ini b/tox.ini index d695433..7742484 100644 --- a/tox.ini +++ b/tox.ini @@ -5,7 +5,7 @@ [tox] envlist = - py{27,35,36,37,38,39,310,py,py3}, doc + py{35,36,37,38,39,310,py3}, doc skip_missing_interpreters = True From 4cf9eedfb3693da4d26c4c02ec316b83951cf020 Mon Sep 17 00:00:00 2001 From: Daniel Bowring Date: Thu, 15 Aug 2024 21:57:26 +1000 Subject: [PATCH 05/27] Drop support for EOL Python 3.5 --- .travis.yml | 2 -- readability/readability.py | 6 +++--- setup.py | 1 - tox.ini | 2 +- 4 files changed, 4 insertions(+), 7 deletions(-) diff --git a/.travis.yml b/.travis.yml index cab9e23..9f032c7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,8 +4,6 @@ cache: pip matrix: include: - - name: "Python 3.5 on Linux" - python: 3.5 - name: "Python 3.6 on Linux" python: 3.6 - name: "Python 3.7 on Linux" diff --git a/readability/readability.py b/readability/readability.py index 5fc8b32..c86e7d1 100755 --- a/readability/readability.py +++ b/readability/readability.py @@ -91,7 +91,7 @@ def compile_pattern(elements): if isinstance(elements, (list, tuple)): return re.compile("|".join([re.escape(x.strip()) for x in elements]), re.U) else: - raise Exception("Unknown type for the pattern: {}".format(type(elements))) + raise Exception(f"Unknown type for the pattern: {type(elements)}") # assume string or string like object @@ -332,7 +332,7 @@ def select_best_candidate(self, candidates): ) for candidate in sorted_candidates[:5]: elem = candidate["elem"] - log.debug("Top 5 : %6.3f %s" % (candidate["content_score"], describe(elem))) + log.debug("Top 5 : {:6.3f} {}".format(candidate["content_score"], describe(elem))) best_candidate = sorted_candidates[0] return best_candidate @@ -448,7 +448,7 @@ def score_node(self, elem): def remove_unlikely_candidates(self): for elem in self.html.findall(".//*"): - s = "%s %s" % (elem.get("class", ""), elem.get("id", "")) + s = "{} {}".format(elem.get("class", ""), elem.get("id", "")) if len(s) < 2: continue if ( diff --git a/setup.py b/setup.py index 032c057..b5d47da 100755 --- a/setup.py +++ b/setup.py @@ -66,7 +66,6 @@ def find_version(*file_paths): "Topic :: Software Development :: Libraries :: Python Modules", "Programming Language :: Python", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", diff --git a/tox.ini b/tox.ini index 7742484..532a20f 100644 --- a/tox.ini +++ b/tox.ini @@ -5,7 +5,7 @@ [tox] envlist = - py{35,36,37,38,39,310,py3}, doc + py{36,37,38,39,310,py3}, doc skip_missing_interpreters = True From 0b01ac6972e19c126e6235c06ed6afdb16712a50 Mon Sep 17 00:00:00 2001 From: Daniel Bowring Date: Thu, 15 Aug 2024 21:58:12 +1000 Subject: [PATCH 06/27] Drop support for EOL Python 3.6 --- .travis.yml | 2 -- setup.py | 1 - tox.ini | 2 +- 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index 9f032c7..6fb4640 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,8 +4,6 @@ cache: pip matrix: include: - - name: "Python 3.6 on Linux" - python: 3.6 - name: "Python 3.7 on Linux" python: 3.7 - name: "Python 3.8 on Linux" diff --git a/setup.py b/setup.py index b5d47da..dfb5db5 100755 --- a/setup.py +++ b/setup.py @@ -66,7 +66,6 @@ def find_version(*file_paths): "Topic :: Software Development :: Libraries :: Python Modules", "Programming Language :: Python", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", diff --git a/tox.ini b/tox.ini index 532a20f..53d7873 100644 --- a/tox.ini +++ b/tox.ini @@ -5,7 +5,7 @@ [tox] envlist = - py{36,37,38,39,310,py3}, doc + py{38,39,310,py,py3}, doc skip_missing_interpreters = True From 26f11c05d8e11d826e2d9438c714d4f9bb6a1d43 Mon Sep 17 00:00:00 2001 From: Daniel Bowring Date: Thu, 15 Aug 2024 21:59:19 +1000 Subject: [PATCH 07/27] Drop support for EOL Python 3.7 --- .travis.yml | 20 -------------------- setup.py | 1 - tox.ini | 2 +- 3 files changed, 1 insertion(+), 22 deletions(-) diff --git a/.travis.yml b/.travis.yml index 6fb4640..ea56f51 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,8 +4,6 @@ cache: pip matrix: include: - - name: "Python 3.7 on Linux" - python: 3.7 - name: "Python 3.8 on Linux" dist: xenial python: 3.8 @@ -14,24 +12,6 @@ matrix: python: nightly - name: "Pypy 3 on Linux" python: pypy3 - - name: "Python 3.7 on older macOS" - os: osx - osx_image: xcode9.4 - language: shell - env: TOXENV=py37 - before_install: - - sw_vers - - python3 --version - - pip3 --version - - name: "Python 3.7 on macOS" - os: osx - osx_image: xcode11 - language: shell - env: TOXENV=py37 - before_install: - - sw_vers - - python3 --version - - pip3 --version allow_failures: - python: nightly - python: pypy3 diff --git a/setup.py b/setup.py index dfb5db5..1a445fb 100755 --- a/setup.py +++ b/setup.py @@ -66,7 +66,6 @@ def find_version(*file_paths): "Topic :: Software Development :: Libraries :: Python Modules", "Programming Language :: Python", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", diff --git a/tox.ini b/tox.ini index 53d7873..b78bd0b 100644 --- a/tox.ini +++ b/tox.ini @@ -5,7 +5,7 @@ [tox] envlist = - py{38,39,310,py,py3}, doc + py{38,39,310,py3}, doc skip_missing_interpreters = True From b34c8d98fde88fe04d39ad241b933ca6a73fc86f Mon Sep 17 00:00:00 2001 From: Daniel Bowring Date: Thu, 15 Aug 2024 22:00:05 +1000 Subject: [PATCH 08/27] Add Python 3.11 to tox matrix --- setup.py | 1 + tox.ini | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1a445fb..a71fba7 100755 --- a/setup.py +++ b/setup.py @@ -69,6 +69,7 @@ def find_version(*file_paths): "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", "Programming Language :: Python :: Implementation :: PyPy", ], ) diff --git a/tox.ini b/tox.ini index b78bd0b..1bcbb6e 100644 --- a/tox.ini +++ b/tox.ini @@ -5,7 +5,7 @@ [tox] envlist = - py{38,39,310,py3}, doc + py{38,39,310,311,py3}, doc skip_missing_interpreters = True From 2987875dea3c89e966ee09c393a0015d7b8bb8da Mon Sep 17 00:00:00 2001 From: Daniel Bowring Date: Thu, 15 Aug 2024 22:01:20 +1000 Subject: [PATCH 09/27] Add Python 3.12 to tox matrix --- setup.py | 1 + tox.ini | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index a71fba7..294572d 100755 --- a/setup.py +++ b/setup.py @@ -70,6 +70,7 @@ def find_version(*file_paths): "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Programming Language :: Python :: Implementation :: PyPy", ], ) diff --git a/tox.ini b/tox.ini index 1bcbb6e..ff8f68c 100644 --- a/tox.ini +++ b/tox.ini @@ -5,7 +5,7 @@ [tox] envlist = - py{38,39,310,311,py3}, doc + py{38,39,310,311,312,py3}, doc skip_missing_interpreters = True From 24d97d1591aaf28695fbb2700523669abfcea4f8 Mon Sep 17 00:00:00 2001 From: Daniel Bowring Date: Thu, 15 Aug 2024 22:06:51 +1000 Subject: [PATCH 10/27] Update documentation build to use sphinx-build --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index ff8f68c..d4adf61 100644 --- a/tox.ini +++ b/tox.ini @@ -30,4 +30,4 @@ commands = [testenv:doc] commands = - python setup.py build_sphinx + sphinx-build -b html doc/source/ build/ From 2e48d37c1a93048de51c8cd19416b4794697bf32 Mon Sep 17 00:00:00 2001 From: Daniel Bowring Date: Thu, 15 Aug 2024 22:07:59 +1000 Subject: [PATCH 11/27] Fix warning during doc build --- doc/source/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index a099772..afb13f7 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -71,7 +71,7 @@ # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = None +language = "en" # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. From acb2f3d019b31f99bba176b2faa9ed73ef224e2b Mon Sep 17 00:00:00 2001 From: Daniel Bowring Date: Thu, 15 Aug 2024 22:09:59 +1000 Subject: [PATCH 12/27] Replace deprecated recommonmark with myst-parser --- doc/source/conf.py | 2 +- tox.ini | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index afb13f7..e70cf9b 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -37,7 +37,7 @@ "sphinx.ext.doctest", "sphinx.ext.intersphinx", "sphinx.ext.todo", - "recommonmark", + "myst_parser", ] # Add any paths that contain templates here, relative to this directory. diff --git a/tox.ini b/tox.ini index d4adf61..3f03df8 100644 --- a/tox.ini +++ b/tox.ini @@ -14,7 +14,7 @@ deps = pytest doc: sphinx doc: sphinx_rtd_theme - doc: recommonmark + doc: myst-parser # This creates the virtual envs with --site-packages so already packages # that are already installed will be reused. This is especially useful on From 7fcf70bea765a60a1cf05d1cdea60d3c3f56ab46 Mon Sep 17 00:00:00 2001 From: Daniel Bowring Date: Sat, 12 Oct 2024 02:34:48 +1100 Subject: [PATCH 13/27] Add support for python 3.13 --- setup.py | 1 + tox.ini | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 294572d..26894d4 100755 --- a/setup.py +++ b/setup.py @@ -71,6 +71,7 @@ def find_version(*file_paths): "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", "Programming Language :: Python :: Implementation :: PyPy", ], ) diff --git a/tox.ini b/tox.ini index 3f03df8..926fda5 100644 --- a/tox.ini +++ b/tox.ini @@ -5,7 +5,7 @@ [tox] envlist = - py{38,39,310,311,312,py3}, doc + py{38,39,310,311,312,313,py3}, doc skip_missing_interpreters = True From 1986e25df8cda751d30599d676cece6942fabd38 Mon Sep 17 00:00:00 2001 From: botlabsDev <54632107+botlabsDev@users.noreply.github.com> Date: Sun, 12 Jan 2025 16:31:37 +0100 Subject: [PATCH 14/27] Fix issue #89, introduce flag option to keep images in summary. --- readability/readability.py | 11 +++---- requirements-dev.txt | 2 ++ .../summary-keep-all-images.sample.html | 29 +++++++++++++++++++ tests/test_article_only.py | 21 ++++++++++++++ 4 files changed, 58 insertions(+), 5 deletions(-) create mode 100644 tests/samples/summary-keep-all-images.sample.html diff --git a/readability/readability.py b/readability/readability.py index c86e7d1..286841c 100755 --- a/readability/readability.py +++ b/readability/readability.py @@ -210,12 +210,13 @@ def get_clean_html(self): """ return clean_attributes(tounicode(self.html, method="html")) - def summary(self, html_partial=False): + def summary(self, html_partial=False, keep_all_images=False): """ Given a HTML file, extracts the text of the article. :param html_partial: return only the div of the document, don't wrap in html and body tags. + :param keep_all_images: Keep all images in summary. Warning: It mutates internal DOM representation of the HTML document, so it is better to call other API methods before this one. @@ -257,7 +258,7 @@ def summary(self, html_partial=False): article = self.html.find("body") if article is None: article = self.html - cleaned_article = self.sanitize(article, candidates) + cleaned_article = self.sanitize(article, candidates, keep_all_images) article_length = len(cleaned_article or "") retry_length = self.retry_length @@ -502,7 +503,7 @@ def reverse_tags(self, node, *tag_names): for tag_name in tag_names: yield from reversed(node.findall(".//%s" % tag_name)) - def sanitize(self, node, candidates): + def sanitize(self, node, candidates, keep_all_images=False): MIN_LEN = self.min_text_length for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"): if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33: @@ -563,8 +564,8 @@ def sanitize(self, node, candidates): to_remove = False reason = "" - # if el.tag == 'div' and counts["img"] >= 1: - # continue + if keep_all_images and el.tag == 'div' and counts["img"] >= 1: + continue if counts["p"] and counts["img"] > 1 + counts["p"] * 1.3: reason = "too many images (%s)" % counts["img"] to_remove = True diff --git a/requirements-dev.txt b/requirements-dev.txt index 4731fa9..6160e33 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,4 +1,6 @@ lxml +lxml_html_clean +pytest chardet nose pep8 diff --git a/tests/samples/summary-keep-all-images.sample.html b/tests/samples/summary-keep-all-images.sample.html new file mode 100644 index 0000000..127683f --- /dev/null +++ b/tests/samples/summary-keep-all-images.sample.html @@ -0,0 +1,29 @@ + + + + +

+ + H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline + +

+

+ + Text Text Text Text Text Text Text Text Text Text + +

+
+ + + + + +
+

+ + Text Text Text Text Text Text Text Text Text Text + +

+ + \ No newline at end of file diff --git a/tests/test_article_only.py b/tests/test_article_only.py index c5592cf..1835d9f 100644 --- a/tests/test_article_only.py +++ b/tests/test_article_only.py @@ -133,3 +133,24 @@ def test_author_absent(self): sample = load_sample("si-game.sample.html") doc = Document(sample) assert '[no-author]' == doc.author() + + def test_keep_images_present(self): + sample = load_sample("summary-keep-all-images.sample.html") + + doc = Document(sample) + + assert " Date: Mon, 13 Jan 2025 02:05:09 +0700 Subject: [PATCH 15/27] Create python-package.yml --- .github/workflows/python-package.yml | 40 ++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 .github/workflows/python-package.yml diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 0000000..73784a4 --- /dev/null +++ b/.github/workflows/python-package.yml @@ -0,0 +1,40 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: Python package + +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install flake8 pytest + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + pytest From efebb4cdd90fdff58b1507534a2b2c72ff7680d6 Mon Sep 17 00:00:00 2001 From: Yuri Baburov Date: Mon, 13 Jan 2025 03:00:49 +0700 Subject: [PATCH 16/27] Removed wrapt decorator --- setup.py | 7 ------- tests/test_article_only.py | 30 ++++++++++++++++++++++++++++-- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/setup.py b/setup.py index 26894d4..1819b41 100755 --- a/setup.py +++ b/setup.py @@ -9,14 +9,8 @@ "cchardet", ] -test_deps = [ - # Test timeouts - "wrapt-timeout-decorator", -] - extras = { 'speed': speed_deps, - 'test': test_deps, } # Adapted from https://github.com/pypa/pip/blob/master/setup.py @@ -54,7 +48,6 @@ def find_version(*file_paths): "lxml-html-clean; python_version < '3.11'", "cssselect" ], - tests_require=test_deps, extras_require=extras, classifiers=[ "Environment :: Web Environment", diff --git a/tests/test_article_only.py b/tests/test_article_only.py index 1835d9f..d6cef52 100644 --- a/tests/test_article_only.py +++ b/tests/test_article_only.py @@ -1,8 +1,34 @@ import os +import time import unittest from readability import Document -from wrapt_timeout_decorator import * +from functools import wraps + + +class TimeoutException(Exception): + """Exception raised when a function exceeds its time limit.""" + pass + + +def timeout(seconds): + """Decorator to enforce a timeout on function execution.""" + def decorator(func): + @wraps(func) + def wrapper(*args, **kwargs): + start_time = time.perf_counter() + result = func(*args, **kwargs) + end_time = time.perf_counter() + elapsed_time = end_time - start_time + if elapsed_time > seconds: + raise TimeoutException( + f"Function '{func.__name__}' exceeded time limit of {seconds} seconds " + f"with an execution time of {elapsed_time:.4f} seconds" + ) + return result + return wrapper + return decorator + SAMPLES = os.path.join(os.path.dirname(__file__), "samples") @@ -100,7 +126,7 @@ def test_correct_cleanup(self): assert not "aside" in s # Many spaces make some regexes run forever - @timeout(3, use_signals=False) + @timeout(3) def test_many_repeated_spaces(self): long_space = " " * 1000000 sample = "

foo" + long_space + "

" From 956bfbbe46597cdb678ab54e5b1dfada0e26da13 Mon Sep 17 00:00:00 2001 From: Yuri Baburov Date: Mon, 13 Jan 2025 03:02:49 +0700 Subject: [PATCH 17/27] Update python-package.yml --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 73784a4..23f1610 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -16,7 +16,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] steps: - uses: actions/checkout@v4 From b220919186d0db2a006bbabb910b602056b3f51a Mon Sep 17 00:00:00 2001 From: Yuri Baburov Date: Thu, 16 Jan 2025 02:48:45 +0700 Subject: [PATCH 18/27] Bump to 0.8.3 --- readability/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/readability/__init__.py b/readability/__init__.py index 6a263bf..18dccae 100644 --- a/readability/__init__.py +++ b/readability/__init__.py @@ -1,3 +1,3 @@ -__version__ = "0.8.2" +__version__ = "0.8.3" from .readability import Document From c1574456f5aefc1dc05d7def332c48e3799e214c Mon Sep 17 00:00:00 2001 From: cdhigh Date: Thu, 1 May 2025 10:37:30 -0300 Subject: [PATCH 19/27] shorten_title supports CJK character sets. --- readability/htmls.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/readability/htmls.py b/readability/htmls.py index 87299f5..b090aa5 100644 --- a/readability/htmls.py +++ b/readability/htmls.py @@ -110,29 +110,35 @@ def shorten_title(doc): if e.text_content(): add_match(candidates, e.text_content(), orig) + cjk = re.compile('[\u4e00-\u9fff]+') + if candidates: title = sorted(candidates, key=len)[-1] else: for delimiter in [" | ", " - ", " :: ", " / "]: if delimiter in title: parts = orig.split(delimiter) - if len(parts[0].split()) >= 4: - title = parts[0] + p0 = parts[0] + pl = parts[-1] + if (len(p0.split()) >= 4) or (len(p0) >= 4 and cjk.search(p0)): + title = p0 break - elif len(parts[-1].split()) >= 4: - title = parts[-1] + elif (len(p1.split()) >= 4) or (len(p1) >= 4 and cjk.search(p1)): + title = p1 break else: if ": " in title: - parts = orig.split(": ") - if len(parts[-1].split()) >= 4: - title = parts[-1] + p1 = orig.split(": ")[-1] + if (len(p1.split()) >= 4) or (len(p1) >= 4 and cjk.search(p1)): + title = p1 else: title = orig.split(": ", 1)[1] - if not 15 < len(title) < 150: + if cjk.search(title) and not (4 <= len(title) < 100): return orig - + elif not 15 < len(title) < 150: + return orig + return title From 16ce81dd89bf25b179dced79070fb933857e5dc6 Mon Sep 17 00:00:00 2001 From: cdhigh Date: Thu, 1 May 2025 10:47:50 -0300 Subject: [PATCH 20/27] Update cleaners.py --- readability/cleaners.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/readability/cleaners.py b/readability/cleaners.py index 69825c6..e0b0726 100644 --- a/readability/cleaners.py +++ b/readability/cleaners.py @@ -1,6 +1,9 @@ # strip out a set of nuisance html attributes that can mess up rendering in RSS feeds import re -from lxml.html.clean import Cleaner +try: + from lxml.html.clean import Cleaner +except ImportError: + from lxml_html_clean import Cleaner bad_attrs = ["width", "height", "style", "[-a-z]*color", "background[-a-z]*", "on*"] single_quoted = "'[^']+'" From f02d865bc4afc435cc02224a6915494a33abe629 Mon Sep 17 00:00:00 2001 From: Yuri Baburov Date: Sat, 3 May 2025 18:39:26 +0700 Subject: [PATCH 21/27] Added nose to requirements-dev so "make test" will work again. --- requirements-dev.txt | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 6160e33..9f580cb 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,8 +1 @@ -lxml -lxml_html_clean -pytest -chardet -nose -pep8 -coverage -wrapt-timeout-decorator +nose \ No newline at end of file From 6f1b449962fe577e8d695d3147a9fbc21b9bd333 Mon Sep 17 00:00:00 2001 From: Yuri Baburov Date: Sat, 3 May 2025 18:49:08 +0700 Subject: [PATCH 22/27] Better CJK support (and fix for lxml-clean), thanks @cdhigh --- .gitignore | 3 ++- Makefile | 7 ++++--- README.rst | 4 +++- pyproject.toml | 19 +++++++++++++++++++ readability/__init__.py | 2 +- 5 files changed, 29 insertions(+), 6 deletions(-) create mode 100644 pyproject.toml diff --git a/.gitignore b/.gitignore index d896106..b532e65 100644 --- a/.gitignore +++ b/.gitignore @@ -14,4 +14,5 @@ nosetests.xml .idea .cache /.noseids -/.venv \ No newline at end of file +/.venv +/poetry.lock \ No newline at end of file diff --git a/Makefile b/Makefile index 012e4b7..f1c8f21 100644 --- a/Makefile +++ b/Makefile @@ -50,6 +50,7 @@ clean_all: clean_venv # ########### .PHONY: dist dist: + $(PY) -m pip install wheel $(PY) setup.py sdist bdist_wheel $(TWINE) check dist/* @@ -57,6 +58,6 @@ dist: upload: $(TWINE) upload dist/* -.PHONY: version_update -version_update: - $(EDITOR) setup.py +.PHONY: bump +bump: + $(EDITOR) readability/__init__.py diff --git a/README.rst b/README.rst index 9b0a8b7..72b4e63 100644 --- a/README.rst +++ b/README.rst @@ -48,7 +48,9 @@ Usage Change Log ---------- - +- 0.8.4 Better CJK support, thanks @cdhigh +- 0.8.3.1 Support for python 3.8 - 3.13 +- 0.8.3 We can now save all images via keep_all_images=True (default is to save 1 main image), thanks @botlabsDev - 0.8.2 Added article author(s) (thanks @mattblaha) - 0.8.1 Fixed processing of non-ascii HTMLs via regexps. - 0.8 Replaced XHTML output with HTML5 output in summary() call. diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..4dad46a --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,19 @@ +[tool.poetry] +name = "readability-lxml" +version = "0.8.4" +description = "fast html to text parser (article readability tool) with python 3 support" +authors = ["Yuri Baburov "] +license = "Apache License 2.0" +readme = "README.rst" + +[tool.poetry.dependencies] +python = ">=3.8.2,<3.14" +chardet = "^5.2.0" +cssselect = "~1.2" +lxml = {extras = ["html-clean"], version = "^5.4.0"} +lxml-html-clean = {markers = "python_version < \"3.11\"", version = "^0.4.2"} + + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/readability/__init__.py b/readability/__init__.py index 18dccae..f27111b 100644 --- a/readability/__init__.py +++ b/readability/__init__.py @@ -1,3 +1,3 @@ -__version__ = "0.8.3" +__version__ = "0.8.4" from .readability import Document From 344ba9e7c4839019af1d6aace030a9425eeb06cf Mon Sep 17 00:00:00 2001 From: Yuri Baburov Date: Sat, 3 May 2025 18:49:08 +0700 Subject: [PATCH 23/27] Better CJK support (and fix for lxml-clean), thanks @cdhigh --- requirements.txt | 1 - 1 file changed, 1 deletion(-) delete mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index d6e1198..0000000 --- a/requirements.txt +++ /dev/null @@ -1 +0,0 @@ --e . From be72501fec6d4924ca97cdfccfa03eaad57cc249 Mon Sep 17 00:00:00 2001 From: Yuri Baburov Date: Sat, 3 May 2025 19:11:41 +0700 Subject: [PATCH 24/27] Updates for publishing. --- Makefile | 2 +- README.md | 67 +++++++++++++++++++++++++++++++++++++ README.rst | 78 -------------------------------------------- requirements-dev.txt | 3 +- setup.py | 4 +-- 5 files changed, 72 insertions(+), 82 deletions(-) create mode 100644 README.md delete mode 100644 README.rst diff --git a/Makefile b/Makefile index f1c8f21..ba14e4f 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ PY := .venv/bin/python PIP := .venv/bin/pip PEP8 := .venv/bin/pep8 NOSE := .venv/bin/nosetests -TWINE := twine +TWINE := .venv/bin/twine # ########### # Tests rule! diff --git a/README.md b/README.md new file mode 100644 index 0000000..e09a515 --- /dev/null +++ b/README.md @@ -0,0 +1,67 @@ +[![PyPI version](https://img.shields.io/pypi/v/readability-lxml.svg)](https://pypi.python.org/pypi/readability-lxml) + +# python-readability + +Given an HTML document, extract and clean up the main body text and title. + +This is a Python port of a Ruby port of [arc90's Readability project](https://web.archive.org/web/20130519040221/http://www.readability.com/). + +## Installation + +It's easy using `pip`, just run: + +```bash +$ pip install readability-lxml +``` + +As an alternative, you may also use conda to install, just run: + +```bash +$ conda install -c conda-forge readability-lxml +``` + +## Usage + +```python +>>> import requests +>>> from readability import Document + +>>> response = requests.get('http://example.com') +>>> doc = Document(response.content) +>>> doc.title() +'Example Domain' + +>>> doc.summary() +"""
\n
\n

Example Domain

\n +

This domain is established to be used for illustrative examples in documents. You may +use this\n domain in examples without prior coordination or asking for permission.

+\n

More information...

\n
+\n\n
""" +``` + +## Change Log +- 0.8.4 Better CJK support, thanks @cdhigh +- 0.8.3.1 Support for python 3.8 - 3.13 +- 0.8.3 We can now save all images via keep_all_images=True (default is to save 1 main image), thanks @botlabsDev +- 0.8.2 Added article author(s) (thanks @mattblaha) +- 0.8.1 Fixed processing of non-ascii HTMLs via regexps. +- 0.8 Replaced XHTML output with HTML5 output in summary() call. +- 0.7.1 Support for Python 3.7 . Fixed a slowdown when processing documents with lots of spaces. +- 0.7 Improved HTML5 tags handling. Fixed stripping unwanted HTML nodes (only first matching node was removed before). +- 0.6 Finally a release which supports Python versions 2.6, 2.7, 3.3 - 3.6 +- 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and 3.4 +- 0.4 Added Videos loading and allowed more images per paragraph +- 0.3 Added Document.encoding, positive\_keywords and negative\_keywords + +## Licensing + +This code is under [the Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0) license. + +## Thanks to + +- Latest [readability.js](https://github.com/MHordecki/readability-redux/blob/master/readability/readability.js) +- Ruby port by starrhorne and iterationlabs +- [Python port](https://github.com/gfxmonk/python-readability) by gfxmonk +- [Decruft effort](https://web.archive.org/web/20110214150709/https://www.minvolai.com/blog/decruft-arc90s-readability-in-python/) to move to lxml +- "BR to P" fix from readability.js which improves quality for smaller texts +- Github users contributions. diff --git a/README.rst b/README.rst deleted file mode 100644 index 72b4e63..0000000 --- a/README.rst +++ /dev/null @@ -1,78 +0,0 @@ -.. image:: https://travis-ci.org/buriy/python-readability.svg?branch=master - :target: https://travis-ci.org/buriy/python-readability -.. image:: https://img.shields.io/pypi/v/readability-lxml.svg - :target: https://pypi.python.org/pypi/readability-lxml - -python-readability -================== - -Given an HTML document, extract and clean up the main body text and title. - -This is a Python port of a Ruby port of `arc90's Readability -project `__. - -Installation ------------- - -It's easy using ``pip``, just run: - -.. code-block:: bash - - $ pip install readability-lxml - -As an alternative, you may also use conda to install, just run: - -.. code-block:: bash - - $ conda install -c conda-forge readability-lxml - -Usage ------ - -.. code-block:: python - - >>> import requests - >>> from readability import Document - - >>> response = requests.get('http://example.com') - >>> doc = Document(response.content) - >>> doc.title() - 'Example Domain' - - >>> doc.summary() - """
\n
\n

Example Domain

\n -

This domain is established to be used for illustrative examples in documents. You may - use this\n domain in examples without prior coordination or asking for permission.

- \n

More information...

\n
- \n\n
""" - -Change Log ----------- -- 0.8.4 Better CJK support, thanks @cdhigh -- 0.8.3.1 Support for python 3.8 - 3.13 -- 0.8.3 We can now save all images via keep_all_images=True (default is to save 1 main image), thanks @botlabsDev -- 0.8.2 Added article author(s) (thanks @mattblaha) -- 0.8.1 Fixed processing of non-ascii HTMLs via regexps. -- 0.8 Replaced XHTML output with HTML5 output in summary() call. -- 0.7.1 Support for Python 3.7 . Fixed a slowdown when processing documents with lots of spaces. -- 0.7 Improved HTML5 tags handling. Fixed stripping unwanted HTML nodes (only first matching node was removed before). -- 0.6 Finally a release which supports Python versions 2.6, 2.7, 3.3 - 3.6 -- 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and 3.4 -- 0.4 Added Videos loading and allowed more images per paragraph -- 0.3 Added Document.encoding, positive\_keywords and negative\_keywords - -Licensing ---------- - -This code is under `the Apache License -2.0 `__ license. - -Thanks to ---------- - -- Latest `readability.js `__ -- Ruby port by starrhorne and iterationlabs -- `Python port `__ by gfxmonk -- `Decruft effort ` to move to lxml -- "BR to P" fix from readability.js which improves quality for smaller texts -- Github users contributions. diff --git a/requirements-dev.txt b/requirements-dev.txt index 9f580cb..bc876e5 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1 +1,2 @@ -nose \ No newline at end of file +nose +twine \ No newline at end of file diff --git a/setup.py b/setup.py index 1819b41..a88e818 100755 --- a/setup.py +++ b/setup.py @@ -37,8 +37,8 @@ def find_version(*file_paths): author_email="burchik@gmail.com", description="fast html to text parser (article readability tool) with python 3 support", test_suite="tests.test_article_only", - long_description=open("README.rst").read(), - long_description_content_type='text/x-rst', + long_description=open("README.md").read(), + long_description_content_type="text/markdown", license="Apache License 2.0", url="http://github.com/buriy/python-readability", packages=["readability"], From 11c721d920c674a145e142e2d1a5bc11ea6278f9 Mon Sep 17 00:00:00 2001 From: Yuri Baburov Date: Sun, 4 May 2025 03:57:01 +0700 Subject: [PATCH 25/27] Fix CJK title fix, added a test --- readability/encoding.py | 3 +-- readability/htmls.py | 11 ++++---- readability/readability.py | 10 ++++---- tests/test_article_only.py | 51 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 63 insertions(+), 12 deletions(-) diff --git a/readability/encoding.py b/readability/encoding.py index c95cc14..08332df 100644 --- a/readability/encoding.py +++ b/readability/encoding.py @@ -1,9 +1,8 @@ import re try: - import cchardet + import cchardet as chardet except ImportError: import chardet -import sys RE_CHARSET = re.compile(r']', flags=re.I) diff --git a/readability/htmls.py b/readability/htmls.py index b090aa5..d99a9f5 100644 --- a/readability/htmls.py +++ b/readability/htmls.py @@ -123,8 +123,8 @@ def shorten_title(doc): if (len(p0.split()) >= 4) or (len(p0) >= 4 and cjk.search(p0)): title = p0 break - elif (len(p1.split()) >= 4) or (len(p1) >= 4 and cjk.search(p1)): - title = p1 + elif (len(pl.split()) >= 4) or (len(pl) >= 4 and cjk.search(pl)): + title = pl break else: if ": " in title: @@ -134,11 +134,12 @@ def shorten_title(doc): else: title = orig.split(": ", 1)[1] - if cjk.search(title) and not (4 <= len(title) < 100): - return orig + if cjk.search(title): + if not (4 <= len(title) < 100): # Allow length >= 4, cap at 100 + return orig elif not 15 < len(title) < 150: return orig - + return title diff --git a/readability/readability.py b/readability/readability.py index 286841c..c573905 100755 --- a/readability/readability.py +++ b/readability/readability.py @@ -42,11 +42,11 @@ "divToPElementsRe": re.compile( r"<(a|blockquote|dl|div|img|ol|p|pre|table|ul)", re.I ), - #'replaceBrsRe': re.compile(r'(]*>[ \n\r\t]*){2,}',re.I), - #'replaceFontsRe': re.compile(r'<(\/?)font[^>]*>',re.I), - #'trimRe': re.compile(r'^\s+|\s+$/'), - #'normalizeRe': re.compile(r'\s{2,}/'), - #'killBreaksRe': re.compile(r'((\s| ?)*){1,}/'), + # 'replaceBrsRe': re.compile(r'(]*>[ \n\r\t]*){2,}',re.I), + # 'replaceFontsRe': re.compile(r'<(\/?)font[^>]*>',re.I), + # 'trimRe': re.compile(r'^\s+|\s+$/'), + # 'normalizeRe': re.compile(r'\s{2,}/'), + # 'killBreaksRe': re.compile(r'((\s| ?)*){1,}/'), "videoRe": re.compile(r"https?:\/\/(www\.)?(youtube|vimeo)\.com", re.I), # skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i, } diff --git a/tests/test_article_only.py b/tests/test_article_only.py index d6cef52..fe32212 100644 --- a/tests/test_article_only.py +++ b/tests/test_article_only.py @@ -149,6 +149,7 @@ def test_utf8_kanji(self): sample = load_sample("utf-8-kanji.sample.html") doc = Document(sample) res = doc.summary() + assert 0 < len(res) < 10000 def test_author_present(self): sample = load_sample("the-hurricane-rubin-carter-denzel-washington.html") @@ -180,3 +181,53 @@ def test_keep_images_absent_by_defautl(self): doc = Document(sample) assert " + + 这是标题 + + +
一些无关紧要的内容
+
+

主要文章标题

+

这是主要内容的第一段。

+

これはコンテンツの第2段落です。

+

이것은 콘텐츠의 세 번째 단락입니다.

+

This is the fourth paragraph.

+
+
More irrelevant stuff
+ + + """ + doc = Document(html) + summary = doc.summary() + # Check that the main CJK content is present in the summary + self.assertTrue("这是主要内容的第一段" in summary) + self.assertTrue("これはコンテンツの第2段落です" in summary) + self.assertTrue("이것은 콘텐츠의 세 번째 단락입니다" in summary) + # Check that irrelevant content is mostly gone + self.assertFalse("一些无关紧要的内容" in summary) + + def test_shorten_title_delimiter_bug(self): + """Test that shorten_title handles delimiters correctly when the last part is valid. + + This specifically targets a potential bug where 'p1' might be used instead of 'pl'. + """ + html = """ + + + Short Part | これは長いです + + +
Content
+ + + """ + doc = Document(html) + # With the bug, this call might raise NameError: name 'p1' is not defined + # With the fix, it should correctly return the last part. + short_title = doc.short_title() + self.assertEqual(short_title, "これは長いです") From 72318f15af98e24a67c48b5abb646f6f2e3f109a Mon Sep 17 00:00:00 2001 From: Yuri Baburov Date: Sun, 4 May 2025 03:57:26 +0700 Subject: [PATCH 26/27] Fix poetry builds --- Makefile | 6 +++++- pyproject.toml | 10 ++++++++-- requirements-dev.txt | 3 ++- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index ba14e4f..52502f3 100644 --- a/Makefile +++ b/Makefile @@ -24,7 +24,7 @@ all: setup develop venv: .venv/bin/python setup: venv - $(PIP) install -r requirements-dev.txt + $(PIP) install -r requirements-dev.txt | grep -v "already satisfied" || true .venv/bin/python: test -d .venv || which python3 && python3 -m venv .venv || virtualenv .venv @@ -45,6 +45,10 @@ develop: .venv/lib/python*/site-packages/readability-lxml.egg-link .PHONY: clean_all clean_all: clean_venv +.PHONY: build +build: + poetry build + # ########### # Deploy # ########### diff --git a/pyproject.toml b/pyproject.toml index 4dad46a..cff93b1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,12 +4,18 @@ version = "0.8.4" description = "fast html to text parser (article readability tool) with python 3 support" authors = ["Yuri Baburov "] license = "Apache License 2.0" -readme = "README.rst" +readme = "README.md" +packages = [ + { include = "readability" }, +] [tool.poetry.dependencies] python = ">=3.8.2,<3.14" chardet = "^5.2.0" -cssselect = "~1.2" +cssselect = [ + { version = "~1.2", markers = "python_version < '3.9'" }, + { version = "~1.3", markers = "python_version >= '3.9'" } +] lxml = {extras = ["html-clean"], version = "^5.4.0"} lxml-html-clean = {markers = "python_version < \"3.11\"", version = "^0.4.2"} diff --git a/requirements-dev.txt b/requirements-dev.txt index bc876e5..996bbfc 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,2 +1,3 @@ nose -twine \ No newline at end of file +twine +flake8 \ No newline at end of file From c8d8011f3d4c69d7667a52395237e56e66af8ea4 Mon Sep 17 00:00:00 2001 From: Yuri Baburov Date: Sun, 4 May 2025 04:09:54 +0700 Subject: [PATCH 27/27] Bump version to 0.8.4.1 --- .flake8 | 2 ++ Makefile | 6 ++++++ pyproject.toml | 2 +- readability/__init__.py | 2 +- 4 files changed, 10 insertions(+), 2 deletions(-) create mode 100644 .flake8 diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..b33811f --- /dev/null +++ b/.flake8 @@ -0,0 +1,2 @@ +[flake8] +ignore = E501, W503 \ No newline at end of file diff --git a/Makefile b/Makefile index 52502f3..9caf08a 100644 --- a/Makefile +++ b/Makefile @@ -65,3 +65,9 @@ upload: .PHONY: bump bump: $(EDITOR) readability/__init__.py + $(eval VERSION := $(shell grep "__version__" readability/__init__.py | cut -d'"' -f2)) + # fix first occurrence of version in pyproject.toml + sed -i '0,/version = ".*"/s//version = "$(VERSION)"/' pyproject.toml + git commit -m "Bump version to $(VERSION)" pyproject.toml readability/__init__.py + git tag $(VERSION) + git push --tags diff --git a/pyproject.toml b/pyproject.toml index cff93b1..4499285 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "readability-lxml" -version = "0.8.4" +version = "0.8.4.1" description = "fast html to text parser (article readability tool) with python 3 support" authors = ["Yuri Baburov "] license = "Apache License 2.0" diff --git a/readability/__init__.py b/readability/__init__.py index f27111b..b36f021 100644 --- a/readability/__init__.py +++ b/readability/__init__.py @@ -1,3 +1,3 @@ -__version__ = "0.8.4" +__version__ = "0.8.4.1" from .readability import Document