diff --git a/.flake8 b/.flake8 new file mode 100644 index 00000000..b33811f1 --- /dev/null +++ b/.flake8 @@ -0,0 +1,2 @@ +[flake8] +ignore = E501, W503 \ No newline at end of file diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 00000000..23f16106 --- /dev/null +++ b/.github/workflows/python-package.yml @@ -0,0 +1,40 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: Python package + +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install flake8 pytest + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + pytest diff --git a/.gitignore b/.gitignore index d8961065..b532e65e 100644 --- a/.gitignore +++ b/.gitignore @@ -14,4 +14,5 @@ nosetests.xml .idea .cache /.noseids -/.venv \ No newline at end of file +/.venv +/poetry.lock \ No newline at end of file diff --git a/.travis.yml b/.travis.yml index 21e1ce11..ea56f519 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,47 +4,16 @@ cache: pip matrix: include: - - name: "Python 2.7 on Linux" - python: 2.7 - env: PIP=pip - - name: "Python 3.5 on Linux" - python: 3.5 - - name: "Python 3.6 on Linux" - python: 3.6 - - name: "Python 3.7 on Linux" - python: 3.7 - name: "Python 3.8 on Linux" dist: xenial python: 3.8 - name: "Python 3.9 Nightly on Linux" dist: bionic python: nightly - - name: "Pypy on Linux" - python: pypy - env: PIP=pip - name: "Pypy 3 on Linux" python: pypy3 - - name: "Python 3.7 on older macOS" - os: osx - osx_image: xcode9.4 - language: shell - env: TOXENV=py37 - before_install: - - sw_vers - - python3 --version - - pip3 --version - - name: "Python 3.7 on macOS" - os: osx - osx_image: xcode11 - language: shell - env: TOXENV=py37 - before_install: - - sw_vers - - python3 --version - - pip3 --version allow_failures: - python: nightly - - python: pypy - python: pypy3 - os: osx diff --git a/Makefile b/Makefile index 012e4b78..9caf08a5 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ PY := .venv/bin/python PIP := .venv/bin/pip PEP8 := .venv/bin/pep8 NOSE := .venv/bin/nosetests -TWINE := twine +TWINE := .venv/bin/twine # ########### # Tests rule! 
@@ -24,7 +24,7 @@ all: setup develop venv: .venv/bin/python setup: venv - $(PIP) install -r requirements-dev.txt + $(PIP) install -r requirements-dev.txt | grep -v "already satisfied" || true .venv/bin/python: test -d .venv || which python3 && python3 -m venv .venv || virtualenv .venv @@ -45,11 +45,16 @@ develop: .venv/lib/python*/site-packages/readability-lxml.egg-link .PHONY: clean_all clean_all: clean_venv +.PHONY: build +build: + poetry build + # ########### # Deploy # ########### .PHONY: dist dist: + $(PY) -m pip install wheel $(PY) setup.py sdist bdist_wheel $(TWINE) check dist/* @@ -57,6 +62,12 @@ dist: upload: $(TWINE) upload dist/* -.PHONY: version_update -version_update: - $(EDITOR) setup.py +.PHONY: bump +bump: + $(EDITOR) readability/__init__.py + $(eval VERSION := $(shell grep "__version__" readability/__init__.py | cut -d'"' -f2)) + # fix first occurrence of version in pyproject.toml + sed -i '0,/version = ".*"/s//version = "$(VERSION)"/' pyproject.toml + git commit -m "Bump version to $(VERSION)" pyproject.toml readability/__init__.py + git tag $(VERSION) + git push --tags diff --git a/README.md b/README.md new file mode 100644 index 00000000..e09a515a --- /dev/null +++ b/README.md @@ -0,0 +1,67 @@ +[![PyPI version](https://img.shields.io/pypi/v/readability-lxml.svg)](https://pypi.python.org/pypi/readability-lxml) + +# python-readability + +Given an HTML document, extract and clean up the main body text and title. + +This is a Python port of a Ruby port of [arc90's Readability project](https://web.archive.org/web/20130519040221/http://www.readability.com/). + +## Installation + +It's easy using `pip`, just run: + +```bash +$ pip install readability-lxml +``` + +As an alternative, you may also use conda to install, just run: + +```bash +$ conda install -c conda-forge readability-lxml +``` + +## Usage + +```python +>>> import requests +>>> from readability import Document + +>>> response = requests.get('http://example.com') +>>> doc = Document(response.content) +>>> doc.title() +'Example Domain' + +>>> doc.summary() +"""
<html><body><div><body id="readabilityBody">\n<div>\n<h1>Example Domain</h1>\n
+<p>This domain is established to be used for illustrative examples in documents. You may
+use this\n    domain in examples without prior coordination or asking for permission.</p>\n
+<p><a href="http://www.iana.org/domains/example">More information...</a></p>\n
+</div>\n</body></div></body></html>"""
""" +``` + +## Change Log +- 0.8.4 Better CJK support, thanks @cdhigh +- 0.8.3.1 Support for python 3.8 - 3.13 +- 0.8.3 We can now save all images via keep_all_images=True (default is to save 1 main image), thanks @botlabsDev +- 0.8.2 Added article author(s) (thanks @mattblaha) +- 0.8.1 Fixed processing of non-ascii HTMLs via regexps. +- 0.8 Replaced XHTML output with HTML5 output in summary() call. +- 0.7.1 Support for Python 3.7 . Fixed a slowdown when processing documents with lots of spaces. +- 0.7 Improved HTML5 tags handling. Fixed stripping unwanted HTML nodes (only first matching node was removed before). +- 0.6 Finally a release which supports Python versions 2.6, 2.7, 3.3 - 3.6 +- 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and 3.4 +- 0.4 Added Videos loading and allowed more images per paragraph +- 0.3 Added Document.encoding, positive\_keywords and negative\_keywords + +## Licensing + +This code is under [the Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0) license. + +## Thanks to + +- Latest [readability.js](https://github.com/MHordecki/readability-redux/blob/master/readability/readability.js) +- Ruby port by starrhorne and iterationlabs +- [Python port](https://github.com/gfxmonk/python-readability) by gfxmonk +- [Decruft effort](https://web.archive.org/web/20110214150709/https://www.minvolai.com/blog/decruft-arc90s-readability-in-python/) to move to lxml +- "BR to P" fix from readability.js which improves quality for smaller texts +- Github users contributions. diff --git a/README.rst b/README.rst deleted file mode 100644 index 9b0a8b71..00000000 --- a/README.rst +++ /dev/null @@ -1,76 +0,0 @@ -.. image:: https://travis-ci.org/buriy/python-readability.svg?branch=master - :target: https://travis-ci.org/buriy/python-readability -.. image:: https://img.shields.io/pypi/v/readability-lxml.svg - :target: https://pypi.python.org/pypi/readability-lxml - -python-readability -================== - -Given an HTML document, extract and clean up the main body text and title. - -This is a Python port of a Ruby port of `arc90's Readability -project `__. - -Installation ------------- - -It's easy using ``pip``, just run: - -.. code-block:: bash - - $ pip install readability-lxml - -As an alternative, you may also use conda to install, just run: - -.. code-block:: bash - - $ conda install -c conda-forge readability-lxml - -Usage ------ - -.. code-block:: python - - >>> import requests - >>> from readability import Document - - >>> response = requests.get('http://example.com') - >>> doc = Document(response.content) - >>> doc.title() - 'Example Domain' - - >>> doc.summary() - """
-    <p>This domain is established to be used for illustrative examples in documents. You may
-    use this\n    domain in examples without prior coordination or asking for permission.</p>\n
-    <p><a href="http://www.iana.org/domains/example">More information...</a></p>\n
-    </div>\n</body></div></body></html>"""
""" - -Change Log ----------- - -- 0.8.2 Added article author(s) (thanks @mattblaha) -- 0.8.1 Fixed processing of non-ascii HTMLs via regexps. -- 0.8 Replaced XHTML output with HTML5 output in summary() call. -- 0.7.1 Support for Python 3.7 . Fixed a slowdown when processing documents with lots of spaces. -- 0.7 Improved HTML5 tags handling. Fixed stripping unwanted HTML nodes (only first matching node was removed before). -- 0.6 Finally a release which supports Python versions 2.6, 2.7, 3.3 - 3.6 -- 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and 3.4 -- 0.4 Added Videos loading and allowed more images per paragraph -- 0.3 Added Document.encoding, positive\_keywords and negative\_keywords - -Licensing ---------- - -This code is under `the Apache License -2.0 `__ license. - -Thanks to ---------- - -- Latest `readability.js `__ -- Ruby port by starrhorne and iterationlabs -- `Python port `__ by gfxmonk -- `Decruft effort ` to move to lxml -- "BR to P" fix from readability.js which improves quality for smaller texts -- Github users contributions. diff --git a/doc/source/conf.py b/doc/source/conf.py index bb261349..e70cf9b3 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # # readability documentation build configuration file, created by # sphinx-quickstart on Thu Mar 23 16:29:38 2017. @@ -38,7 +37,7 @@ "sphinx.ext.doctest", "sphinx.ext.intersphinx", "sphinx.ext.todo", - "recommonmark", + "myst_parser", ] # Add any paths that contain templates here, relative to this directory. @@ -72,7 +71,7 @@ # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = None +language = "en" # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. 
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..44992853
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,25 @@
+[tool.poetry]
+name = "readability-lxml"
+version = "0.8.4.1"
+description = "fast html to text parser (article readability tool) with python 3 support"
+authors = ["Yuri Baburov <burchik@gmail.com>"]
+license = "Apache License 2.0"
+readme = "README.md"
+packages = [
+    { include = "readability" },
+]
+
+[tool.poetry.dependencies]
+python = ">=3.8.2,<3.14"
+chardet = "^5.2.0"
+cssselect = [
+    { version = "~1.2", markers = "python_version < '3.9'" },
+    { version = "~1.3", markers = "python_version >= '3.9'" }
+]
+lxml = {extras = ["html-clean"], version = "^5.4.0"}
+lxml-html-clean = {markers = "python_version < \"3.11\"", version = "^0.4.2"}
+
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
diff --git a/readability/__init__.py b/readability/__init__.py
index 6a263bf7..b36f021d 100644
--- a/readability/__init__.py
+++ b/readability/__init__.py
@@ -1,3 +1,3 @@
-__version__ = "0.8.2"
+__version__ = "0.8.4.1"
 
 from .readability import Document
diff --git a/readability/cleaners.py b/readability/cleaners.py
index 69825c6b..e0b07260 100644
--- a/readability/cleaners.py
+++ b/readability/cleaners.py
@@ -1,6 +1,9 @@
 # strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
 import re
-from lxml.html.clean import Cleaner
+try:
+    from lxml.html.clean import Cleaner
+except ImportError:
+    from lxml_html_clean import Cleaner
 
 bad_attrs = ["width", "height", "style", "[-a-z]*color", "background[-a-z]*", "on*"]
 single_quoted = "'[^']+'"
diff --git a/readability/compat/__init__.py b/readability/compat/__init__.py
deleted file mode 100644
index caf0ea8f..00000000
--- a/readability/compat/__init__.py
+++ /dev/null
@@ -1,26 +0,0 @@
-"""
-This module contains compatibility helpers for Python 2/3 interoperability.
-
-It mainly exists because their are certain incompatibilities in the Python
-syntax that can only be solved by conditionally importing different functions.
-"""
-import sys
-from lxml.etree import tostring
-
-if sys.version_info[0] == 2:
-    bytes_ = str
-    str_ = unicode
-    def tostring_(s):
-        return tostring(s, encoding='utf-8').decode('utf-8')
-
-elif sys.version_info[0] == 3:
-    bytes_ = bytes
-    str_ = str
-    def tostring_(s):
-        return tostring(s, encoding='utf-8')
-
-
-try:
-    from re import Pattern as pattern_type
-except ImportError:
-    from re import _pattern_type as pattern_type
diff --git a/readability/compat/three.py b/readability/compat/three.py
deleted file mode 100644
index 26351575..00000000
--- a/readability/compat/three.py
+++ /dev/null
@@ -1,6 +0,0 @@
-def raise_with_traceback(exc_type, traceback, *args, **kwargs):
-    """
-    Raise a new exception of type `exc_type` with an existing `traceback`. All
-    additional (keyword-)arguments are forwarded to `exc_type`
-    """
-    raise exc_type(*args, **kwargs).with_traceback(traceback)
diff --git a/readability/compat/two.py b/readability/compat/two.py
deleted file mode 100644
index 642ecb75..00000000
--- a/readability/compat/two.py
+++ /dev/null
@@ -1,6 +0,0 @@
-def raise_with_traceback(exc_type, traceback, *args, **kwargs):
-    """
-    Raise a new exception of type `exc_type` with an existing `traceback`. All
-    additional (keyword-)arguments are forwarded to `exc_type`
-    """
-    raise exc_type(*args, **kwargs), None, traceback
diff --git a/readability/encoding.py b/readability/encoding.py
index 212ff929..08332df0 100644
--- a/readability/encoding.py
+++ b/readability/encoding.py
@@ -1,9 +1,8 @@
 import re
 try:
-    import cchardet
+    import cchardet as chardet
 except ImportError:
     import chardet
-import sys
 
 RE_CHARSET = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
@@ -39,11 +38,10 @@ def get_encoding(page):
     for declared_encoding in declared_encodings:
         try:
             # Python3 only
-            if sys.version_info[0] == 3:
-                # declared_encoding will actually be bytes but .decode() only
-                # accepts `str` type. Decode blindly with ascii because no one should
-                # ever use non-ascii characters in the name of an encoding.
-                declared_encoding = declared_encoding.decode("ascii", "replace")
+            # declared_encoding will actually be bytes but .decode() only
+            # accepts `str` type. Decode blindly with ascii because no one should
+            # ever use non-ascii characters in the name of an encoding.
+            declared_encoding = declared_encoding.decode("ascii", "replace")
             encoding = fix_charset(declared_encoding)
 
             # Now let's decode the page
diff --git a/readability/htmls.py b/readability/htmls.py
index acacb5ab..d99a9f53 100644
--- a/readability/htmls.py
+++ b/readability/htmls.py
@@ -4,13 +4,12 @@
 from .cleaners import normalize_spaces, clean_attributes
 from .encoding import get_encoding
-from .compat import str_
 
 utf8_parser = lxml.html.HTMLParser(encoding="utf-8")
 
 def build_doc(page):
-    if isinstance(page, str_):
+    if isinstance(page, str):
         encoding = None
         decoded_page = page
     else:
@@ -30,14 +29,14 @@ def js_re(src, pattern, flags, repl):
 def normalize_entities(cur_title):
     entities = {
-        u"\u2014": "-",
-        u"\u2013": "-",
-        u"&mdash;": "-",
-        u"&ndash;": "-",
-        u"\u00A0": " ",
-        u"\u00AB": '"',
-        u"\u00BB": '"',
-        u"&quot;": '"',
+        "\u2014": "-",
+        "\u2013": "-",
+        "&mdash;": "-",
+        "&ndash;": "-",
+        "\u00A0": " ",
+        "\u00AB": '"',
+        "\u00BB": '"',
+        "&quot;": '"',
     }
     for c, r in entities.items():
         if c in cur_title:
@@ -111,27 +110,34 @@ def shorten_title(doc):
             if e.text_content():
                 add_match(candidates, e.text_content(), orig)
 
+    cjk = re.compile('[\u4e00-\u9fff]+')
+
     if candidates:
         title = sorted(candidates, key=len)[-1]
     else:
         for delimiter in [" | ", " - ", " :: ", " / "]:
             if delimiter in title:
                 parts = orig.split(delimiter)
-                if len(parts[0].split()) >= 4:
-                    title = parts[0]
+                p0 = parts[0]
+                pl = parts[-1]
+                if (len(p0.split()) >= 4) or (len(p0) >= 4 and cjk.search(p0)):
+                    title = p0
                     break
-                elif len(parts[-1].split()) >= 4:
-                    title = parts[-1]
+                elif (len(pl.split()) >= 4) or (len(pl) >= 4 and cjk.search(pl)):
+                    title = pl
                     break
         else:
             if ": " in title:
-                parts = orig.split(": ")
-                if len(parts[-1].split()) >= 4:
-                    title = parts[-1]
+                p1 = orig.split(": ")[-1]
+                if (len(p1.split()) >= 4) or (len(p1) >= 4 and cjk.search(p1)):
+                    title = p1
                 else:
                     title = orig.split(": ", 1)[1]
 
-    if not 15 < len(title) < 150:
+    if cjk.search(title):
+        if not (4 <= len(title) < 100):  # Allow length >= 4, cap at 100
+            return orig
+    elif not 15 < len(title) < 150:
         return orig
 
     return title
diff --git a/readability/readability.py b/readability/readability.py
index f16b170a..c5739056 100755
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -1,9 +1,12 @@
 #!/usr/bin/env python
-from __future__ import print_function
 import logging
 import re
 import sys
+import urllib.request
+import urllib.parse
+import urllib.error
 
+from lxml.etree import tostring
 from lxml.etree import tounicode
 from lxml.etree import _ElementTree
 from lxml.html import document_fromstring
@@ -17,7 +20,6 @@
 from .htmls import get_title
 from .htmls import get_author
 from .htmls import shorten_title
-from .compat import str_, bytes_, tostring_, pattern_type
 from .debug import describe, text_content
@@ -40,11 +42,11 @@
     "divToPElementsRe": re.compile(
         r"<(a|blockquote|dl|div|img|ol|p|pre|table|ul)", re.I
     ),
-    #'replaceBrsRe': re.compile(r'(<br[^>]*>[ \n\r\t]*){2,}',re.I),
-    #'replaceFontsRe': re.compile(r'<(\/?)font[^>]*>',re.I),
-    #'trimRe': re.compile(r'^\s+|\s+$/'),
-    #'normalizeRe': re.compile(r'\s{2,}/'),
-    #'killBreaksRe': re.compile(r'(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
+    # 'replaceBrsRe': re.compile(r'(<br[^>]*>[ \n\r\t]*){2,}',re.I),
+    # 'replaceFontsRe': re.compile(r'<(\/?)font[^>]*>',re.I),
+    # 'trimRe': re.compile(r'^\s+|\s+$/'),
+    # 'normalizeRe': re.compile(r'\s{2,}/'),
+    # 'killBreaksRe': re.compile(r'(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
     "videoRe": re.compile(r"https?:\/\/(www\.)?(youtube|vimeo)\.com", re.I),
     # skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
 }
@@ -80,16 +82,16 @@ def text_length(i):
 def compile_pattern(elements):
     if not elements:
         return None
-    elif isinstance(elements, pattern_type):
+    elif isinstance(elements, re.Pattern):
        return elements
-    elif isinstance(elements, (str_, bytes_)):
-        if isinstance(elements, bytes_):
-            elements = str_(elements, "utf-8")
-        elements = elements.split(u",")
+    elif isinstance(elements, (str, bytes)):
+        if isinstance(elements, bytes):
+            elements = str(elements, "utf-8")
+        elements = elements.split(",")
     if isinstance(elements, (list, tuple)):
-        return re.compile(u"|".join([re.escape(x.strip()) for x in elements]), re.U)
+        return re.compile("|".join([re.escape(x.strip()) for x in elements]), re.U)
     else:
-        raise Exception("Unknown type for the pattern: {}".format(type(elements)))
+        raise Exception(f"Unknown type for the pattern: {type(elements)}")
     # assume string or string like object
@@ -208,12 +210,13 @@
     def get_clean_html(self):
         """
         """
        return clean_attributes(tounicode(self.html, method="html"))
 
-    def summary(self, html_partial=False):
+    def summary(self, html_partial=False, keep_all_images=False):
         """
         Given a HTML file, extracts the text of the article.
 
         :param html_partial: return only the div of the document, don't wrap
                              in html and body tags.
+        :param keep_all_images: Keep all images in summary.
 
         Warning: It mutates internal DOM representation of the HTML document,
         so it is better to call other API methods before this one.
@@ -242,24 +245,20 @@
                     log.info("ruthless removal did not work. ")
                     ruthless = False
                     log.debug(
-                        (
                         "ended up stripping too much - "
                         "going for a safer _parse"
-                        )
                     )
                     # try again
                     continue
                 else:
                     log.debug(
-                        (
                         "Ruthless and lenient parsing did not work. "
" "Returning raw html" - ) ) article = self.html.find("body") if article is None: article = self.html - cleaned_article = self.sanitize(article, candidates) + cleaned_article = self.sanitize(article, candidates, keep_all_images) article_length = len(cleaned_article or "") retry_length = self.retry_length @@ -272,11 +271,7 @@ def summary(self, html_partial=False): return cleaned_article except Exception as e: log.exception("error getting summary: ") - if sys.version_info[0] == 2: - from .compat.two import raise_with_traceback - else: - from .compat.three import raise_with_traceback - raise_with_traceback(Unparseable, sys.exc_info()[2], str_(e)) + raise Unparseable(str(e)).with_traceback(sys.exc_info()[2]) def get_article(self, candidates, best_candidate, html_partial=False): # Now that we have the top candidate, look through its siblings for @@ -338,7 +333,7 @@ def select_best_candidate(self, candidates): ) for candidate in sorted_candidates[:5]: elem = candidate["elem"] - log.debug("Top 5 : %6.3f %s" % (candidate["content_score"], describe(elem))) + log.debug("Top 5 : {:6.3f} {}".format(candidate["content_score"], describe(elem))) best_candidate = sorted_candidates[0] return best_candidate @@ -454,7 +449,7 @@ def score_node(self, elem): def remove_unlikely_candidates(self): for elem in self.html.findall(".//*"): - s = "%s %s" % (elem.get("class", ""), elem.get("id", "")) + s = "{} {}".format(elem.get("class", ""), elem.get("id", "")) if len(s) < 2: continue if ( @@ -474,7 +469,8 @@ def transform_misused_divs_into_paragraphs(self): # This results in incorrect results in case there is an # buried within an for example if not REGEXES["divToPElementsRe"].search( - str_(b"".join(map(tostring_, list(elem)))) + str(b"".join(tostring(s, encoding='utf-8') for s in elem)) + # str(b"".join(map(tostring_, list(elem)))) ): # log.debug("Altering %s to p" % (describe(elem))) elem.tag = "p" @@ -501,15 +497,13 @@ def transform_misused_divs_into_paragraphs(self): def tags(self, node, *tag_names): for tag_name in tag_names: - for e in node.findall(".//%s" % tag_name): - yield e + yield from node.findall(".//%s" % tag_name) def reverse_tags(self, node, *tag_names): for tag_name in tag_names: - for e in reversed(node.findall(".//%s" % tag_name)): - yield e + yield from reversed(node.findall(".//%s" % tag_name)) - def sanitize(self, node, candidates): + def sanitize(self, node, candidates, keep_all_images=False): MIN_LEN = self.min_text_length for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"): if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33: @@ -570,8 +564,8 @@ def sanitize(self, node, candidates): to_remove = False reason = "" - # if el.tag == 'div' and counts["img"] >= 1: - # continue + if keep_all_images and el.tag == 'div' and counts["img"] >= 1: + continue if counts["p"] and counts["img"] > 1 + counts["p"] * 1.3: reason = "too many images (%s)" % counts["img"] to_remove = True @@ -594,13 +588,13 @@ def sanitize(self, node, candidates): ) to_remove = True elif weight < 25 and link_density > 0.2: - reason = "too many links %.3f for its weight %s" % ( + reason = "too many links {:.3f} for its weight {}".format( link_density, weight, ) to_remove = True elif weight >= 25 and link_density > 0.5: - reason = "too many links %.3f for its weight %s" % ( + reason = "too many links {:.3f} for its weight {}".format( link_density, weight, ) @@ -726,18 +720,10 @@ def main(): file = None if options.url: headers = {"User-Agent": "Mozilla/5.0"} - if sys.version_info[0] == 3: - import 
-
-            request = urllib.request.Request(options.url, None, headers)
-            file = urllib.request.urlopen(request)
-        else:
-            import urllib2
-
-            request = urllib2.Request(options.url, None, headers)
-            file = urllib2.urlopen(request)
+        request = urllib.request.Request(options.url, None, headers)
+        file = urllib.request.urlopen(request)
     else:
-        file = open(args[0], "rt")
+        file = open(args[0])
     try:
         doc = Document(
             file.read(),
@@ -751,14 +737,8 @@
         result = "<h2>" + doc.short_title() + "</h2><br/>" + doc.summary()
         open_in_browser(result)
     else:
-        enc = (
-            sys.__stdout__.encoding or "utf-8"
-        )  # XXX: this hack could not always work, better to set PYTHONIOENCODING
         result = "Title:" + doc.short_title() + "\n" + doc.summary()
-        if sys.version_info[0] == 3:
-            print(result)
-        else:
-            print(result.encode(enc, "replace"))
+        print(result)
     finally:
         file.close()
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 4731fa9d..996bbfc0 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,6 +1,3 @@
-lxml
-chardet
 nose
-pep8
-coverage
-wrapt-timeout-decorator
+twine
+flake8
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index d6e1198b..00000000
--- a/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
--e .
diff --git a/setup.py b/setup.py
index 2770abef..a88e8185 100755
--- a/setup.py
+++ b/setup.py
@@ -1,34 +1,16 @@
 #!/usr/bin/env python
-from __future__ import print_function
 import codecs
 import os
 import re
 from setuptools import setup
-import sys
-
-lxml_requirement = "lxml"
-if sys.platform == "darwin":
-    import platform
-
-    mac_ver = platform.mac_ver()[0]
-    mac_major, mac_minor = mac_ver.split('.')[:2]
-    if int(mac_major) == 10 and int(mac_minor) < 9:
-        print("Using lxml<2.4")
-        lxml_requirement = "lxml<2.4"
 
 speed_deps = [
     "cchardet",
 ]
 
-test_deps = [
-    # Test timeouts
-    "wrapt-timeout-decorator",
-]
-
 extras = {
     'speed': speed_deps,
-    'test': test_deps,
 }
 
 # Adapted from https://github.com/pypa/pip/blob/master/setup.py
@@ -55,13 +37,17 @@ def find_version(*file_paths):
     author_email="burchik@gmail.com",
     description="fast html to text parser (article readability tool) with python 3 support",
     test_suite="tests.test_article_only",
-    long_description=open("README.rst").read(),
-    long_description_content_type='text/x-rst',
+    long_description=open("README.md").read(),
+    long_description_content_type="text/markdown",
     license="Apache License 2.0",
     url="http://github.com/buriy/python-readability",
-    packages=["readability", "readability.compat"],
-    install_requires=["chardet", lxml_requirement, "cssselect"],
-    tests_require=test_deps,
+    packages=["readability"],
+    install_requires=[
+        "chardet",
+        "lxml[html_clean]",
+        "lxml-html-clean; python_version < '3.11'",
+        "cssselect"
+    ],
     extras_require=extras,
     classifiers=[
         "Environment :: Web Environment",
@@ -72,12 +58,13 @@
         "Topic :: Internet",
         "Topic :: Software Development :: Libraries :: Python Modules",
         "Programming Language :: Python",
-        "Programming Language :: Python :: 2",
-        "Programming Language :: Python :: 2.7",
         "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.5",
-        "Programming Language :: Python :: 3.6",
-        "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
+        "Programming Language :: Python :: 3.12",
+        "Programming Language :: Python :: 3.13",
+        "Programming Language :: Python :: Implementation :: PyPy",
     ],
 )
diff --git a/tests/samples/summary-keep-all-images.sample.html b/tests/samples/summary-keep-all-images.sample.html
new file mode 100644
index 00000000..127683fc
--- /dev/null
+++ b/tests/samples/summary-keep-all-images.sample.html
@@ -0,0 +1,29 @@
+<html>
+    <head>
+    </head>
+    <body>
+        <div>
+            <h2>
+                H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline
+            </h2>
+            <img src="image1.jpg">
+            <p>
+                Text Text Text Text Text Text Text Text Text Text
+            </p>
+            <div>
+                <img src="image2.jpg">
+            </div>
+            <div>
+                <img src="image3.jpg">
+            </div>
+            <p>
+                Text Text Text Text Text Text Text Text Text Text
+            </p>
+        </div>
+    </body>
+</html>
\ No newline at end of file
diff --git a/tests/test_article_only.py b/tests/test_article_only.py
index c5592cfb..fe322121 100644
--- a/tests/test_article_only.py
+++ b/tests/test_article_only.py
@@ -1,8 +1,34 @@
 import os
+import time
 import unittest
 
 from readability import Document
-from wrapt_timeout_decorator import *
+from functools import wraps
+
+
+class TimeoutException(Exception):
+    """Exception raised when a function exceeds its time limit."""
+    pass
+
+
+def timeout(seconds):
+    """Decorator to enforce a timeout on function execution."""
+    def decorator(func):
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            start_time = time.perf_counter()
+            result = func(*args, **kwargs)
+            end_time = time.perf_counter()
+            elapsed_time = end_time - start_time
+            if elapsed_time > seconds:
+                raise TimeoutException(
+                    f"Function '{func.__name__}' exceeded time limit of {seconds} seconds "
+                    f"with an execution time of {elapsed_time:.4f} seconds"
+                )
+            return result
+        return wrapper
+    return decorator
+
 
 SAMPLES = os.path.join(os.path.dirname(__file__), "samples")
@@ -100,7 +126,7 @@ def test_correct_cleanup(self):
         assert not "aside" in s
 
     # Many spaces make some regexes run forever
-    @timeout(3, use_signals=False)
+    @timeout(3)
     def test_many_repeated_spaces(self):
         long_space = " " * 1000000
         sample = "<html><body><p>foo" + long_space + "</p></body></html>"
" @@ -123,6 +149,7 @@ def test_utf8_kanji(self): sample = load_sample("utf-8-kanji.sample.html") doc = Document(sample) res = doc.summary() + assert 0 < len(res) < 10000 def test_author_present(self): sample = load_sample("the-hurricane-rubin-carter-denzel-washington.html") @@ -133,3 +160,74 @@ def test_author_absent(self): sample = load_sample("si-game.sample.html") doc = Document(sample) assert '[no-author]' == doc.author() + + def test_keep_images_present(self): + sample = load_sample("summary-keep-all-images.sample.html") + + doc = Document(sample) + + assert " + + 这是标题 + + +
一些无关紧要的内容
+
+

主要文章标题

+

这是主要内容的第一段。

+

これはコンテンツの第2段落です。

+

이것은 콘텐츠의 세 번째 단락입니다.

+

This is the fourth paragraph.

+
+
More irrelevant stuff
+ + + """ + doc = Document(html) + summary = doc.summary() + # Check that the main CJK content is present in the summary + self.assertTrue("这是主要内容的第一段" in summary) + self.assertTrue("これはコンテンツの第2段落です" in summary) + self.assertTrue("이것은 콘텐츠의 세 번째 단락입니다" in summary) + # Check that irrelevant content is mostly gone + self.assertFalse("一些无关紧要的内容" in summary) + + def test_shorten_title_delimiter_bug(self): + """Test that shorten_title handles delimiters correctly when the last part is valid. + + This specifically targets a potential bug where 'p1' might be used instead of 'pl'. + """ + html = """ + + + Short Part | これは長いです + + +
Content
+ + + """ + doc = Document(html) + # With the bug, this call might raise NameError: name 'p1' is not defined + # With the fix, it should correctly return the last part. + short_title = doc.short_title() + self.assertEqual(short_title, "これは長いです") diff --git a/tox.ini b/tox.ini index d6954339..926fda50 100644 --- a/tox.ini +++ b/tox.ini @@ -5,7 +5,7 @@ [tox] envlist = - py{27,35,36,37,38,39,310,py,py3}, doc + py{38,39,310,311,312,313,py3}, doc skip_missing_interpreters = True @@ -14,7 +14,7 @@ deps = pytest doc: sphinx doc: sphinx_rtd_theme - doc: recommonmark + doc: myst-parser # This creates the virtual envs with --site-packages so already packages # that are already installed will be reused. This is especially useful on @@ -30,4 +30,4 @@ commands = [testenv:doc] commands = - python setup.py build_sphinx + sphinx-build -b html doc/source/ build/