diff --git a/.flake8 b/.flake8 new file mode 100644 index 00000000..b33811f1 --- /dev/null +++ b/.flake8 @@ -0,0 +1,2 @@ +[flake8] +ignore = E501, W503 \ No newline at end of file diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..80224f9e --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +tests/samples/* linguist-vendored diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 00000000..23f16106 --- /dev/null +++ b/.github/workflows/python-package.yml @@ -0,0 +1,40 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: Python package + +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install flake8 pytest + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + pytest diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml new file mode 100644 index 00000000..bdaab28a --- /dev/null +++ b/.github/workflows/python-publish.yml @@ -0,0 +1,39 @@ +# This workflow will upload a Python Package using Twine when a release is created +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries + +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +name: Upload Python Package + +on: + release: + types: [published] + +permissions: + contents: read + +jobs: + deploy: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: '3.x' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build + - name: Build package + run: python -m build + - name: Publish package + uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/.gitignore b/.gitignore index 16a2c86e..b532e65e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,8 @@ *.pyc +__pycache__ *.egg-info -build -dist +/build +/dist /bin /include /lib @@ -12,3 +13,6 @@ nosetests.xml .tox .idea .cache +/.noseids +/.venv +/poetry.lock \ No newline at end of file diff --git a/.travis.yml b/.travis.yml index 21e1ce11..ea56f519 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,47 +4,16 @@ cache: pip matrix: include: - - name: "Python 2.7 on Linux" - python: 2.7 - env: PIP=pip - - name: "Python 3.5 on Linux" - python: 3.5 - - name: "Python 3.6 on Linux" - python: 3.6 - - name: "Python 3.7 on Linux" - python: 3.7 - name: "Python 3.8 on Linux" dist: xenial python: 3.8 - name: "Python 3.9 Nightly on Linux" dist: bionic python: nightly - - name: "Pypy on Linux" - python: pypy - env: PIP=pip - name: "Pypy 3 on Linux" python: pypy3 - - name: "Python 3.7 on older macOS" - os: osx - osx_image: xcode9.4 - language: shell - env: TOXENV=py37 - before_install: - - sw_vers - - python3 --version - - pip3 --version - - name: "Python 3.7 on macOS" - os: osx - osx_image: xcode11 - language: shell - env: TOXENV=py37 - before_install: - - sw_vers - - python3 --version - - pip3 --version allow_failures: - python: nightly - - python: pypy - python: pypy3 - os: osx diff --git a/Makefile b/Makefile index 81a14523..9caf08a5 100644 --- a/Makefile +++ b/Makefile @@ -4,6 +4,7 @@ PY := .venv/bin/python PIP := .venv/bin/pip PEP8 := .venv/bin/pep8 NOSE := .venv/bin/nosetests +TWINE := .venv/bin/twine # ########### # Tests rule! @@ -12,22 +13,24 @@ NOSE := .venv/bin/nosetests test: venv develop $(NOSE) $(NOSE) --with-id -s tests -$(NOSE): - $(PIP) install nose pep8 coverage +$(NOSE): setup # ####### # INSTALL # ####### .PHONY: all -all: venv develop +all: setup develop venv: .venv/bin/python +setup: venv + $(PIP) install -r requirements-dev.txt | grep -v "already satisfied" || true + .venv/bin/python: - virtualenv .venv + test -d .venv || which python3 && python3 -m venv .venv || virtualenv .venv -.PHONY: clean_venv -clean_venv: +.PHONY: clean +clean: rm -rf .venv develop: .venv/lib/python*/site-packages/readability-lxml.egg-link @@ -42,18 +45,29 @@ develop: .venv/lib/python*/site-packages/readability-lxml.egg-link .PHONY: clean_all clean_all: clean_venv +.PHONY: build +build: + poetry build # ########### # Deploy # ########### .PHONY: dist dist: - $(PY) setup.py sdist + $(PY) -m pip install wheel + $(PY) setup.py sdist bdist_wheel + $(TWINE) check dist/* .PHONY: upload upload: - $(PY) setup.py sdist upload + $(TWINE) upload dist/* -.PHONY: version_update -version_update: - $(EDITOR) setup.py +.PHONY: bump +bump: + $(EDITOR) readability/__init__.py + $(eval VERSION := $(shell grep "__version__" readability/__init__.py | cut -d'"' -f2)) + # fix first occurrence of version in pyproject.toml + sed -i '0,/version = ".*"/s//version = "$(VERSION)"/' pyproject.toml + git commit -m "Bump version to $(VERSION)" pyproject.toml readability/__init__.py + git tag $(VERSION) + git push --tags diff --git a/README.md b/README.md new file mode 100644 index 00000000..e09a515a --- /dev/null +++ b/README.md @@ -0,0 +1,67 @@ +[![PyPI version](https://img.shields.io/pypi/v/readability-lxml.svg)](https://pypi.python.org/pypi/readability-lxml) + +# python-readability + +Given an HTML document, extract and clean up the main body text and title. + +This is a Python port of a Ruby port of [arc90's Readability project](https://web.archive.org/web/20130519040221/http://www.readability.com/). + +## Installation + +It's easy using `pip`, just run: + +```bash +$ pip install readability-lxml +``` + +As an alternative, you may also use conda to install, just run: + +```bash +$ conda install -c conda-forge readability-lxml +``` + +## Usage + +```python +>>> import requests +>>> from readability import Document + +>>> response = requests.get('http://example.com') +>>> doc = Document(response.content) +>>> doc.title() +'Example Domain' + +>>> doc.summary() +"""
\n
\n

Example Domain

\n +

This domain is established to be used for illustrative examples in documents. You may +use this\n domain in examples without prior coordination or asking for permission.

+\n

More information...

\n
+\n\n
""" +``` + +## Change Log +- 0.8.4 Better CJK support, thanks @cdhigh +- 0.8.3.1 Support for python 3.8 - 3.13 +- 0.8.3 We can now save all images via keep_all_images=True (default is to save 1 main image), thanks @botlabsDev +- 0.8.2 Added article author(s) (thanks @mattblaha) +- 0.8.1 Fixed processing of non-ascii HTMLs via regexps. +- 0.8 Replaced XHTML output with HTML5 output in summary() call. +- 0.7.1 Support for Python 3.7 . Fixed a slowdown when processing documents with lots of spaces. +- 0.7 Improved HTML5 tags handling. Fixed stripping unwanted HTML nodes (only first matching node was removed before). +- 0.6 Finally a release which supports Python versions 2.6, 2.7, 3.3 - 3.6 +- 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and 3.4 +- 0.4 Added Videos loading and allowed more images per paragraph +- 0.3 Added Document.encoding, positive\_keywords and negative\_keywords + +## Licensing + +This code is under [the Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0) license. + +## Thanks to + +- Latest [readability.js](https://github.com/MHordecki/readability-redux/blob/master/readability/readability.js) +- Ruby port by starrhorne and iterationlabs +- [Python port](https://github.com/gfxmonk/python-readability) by gfxmonk +- [Decruft effort](https://web.archive.org/web/20110214150709/https://www.minvolai.com/blog/decruft-arc90s-readability-in-python/) to move to lxml +- "BR to P" fix from readability.js which improves quality for smaller texts +- Github users contributions. diff --git a/README.rst b/README.rst deleted file mode 100644 index d4150aca..00000000 --- a/README.rst +++ /dev/null @@ -1,68 +0,0 @@ -.. image:: https://travis-ci.org/buriy/python-readability.svg?branch=master - :target: https://travis-ci.org/buriy/python-readability - - -python-readability -================== - -Given a html document, it pulls out the main body text and cleans it up. - -This is a python port of a ruby port of `arc90's readability -project `__. - -Installation ------------- - -It's easy using ``pip``, just run: - -.. code-block:: bash - - $ pip install readability-lxml - -Usage ------ - -.. code-block:: python - - >>> import requests - >>> from readability import Document - - >>> response = requests.get('http://example.com') - >>> doc = Document(response.text) - >>> doc.title() - 'Example Domain' - - >>> doc.summary() - """
\n
\n

Example Domain

\n -

This domain is established to be used for illustrative examples in documents. You may - use this\n domain in examples without prior coordination or asking for permission.

- \n

More information...

\n
- \n\n
""" - -Change Log ----------- - -- 0.8.1 Fixed processing of non-ascii HTMLs via regexps. -- 0.8 Replaced XHTML output with HTML5 output in summary() call. -- 0.7.1 Support for Python 3.7 . Fixed a slowdown when processing documents with lots of spaces. -- 0.7 Improved HTML5 tags handling. Fixed stripping unwanted HTML nodes (only first matching node was removed before). -- 0.6 Finally a release which supports Python versions 2.6, 2.7, 3.3 - 3.6 -- 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and 3.4 -- 0.4 Added Videos loading and allowed more images per paragraph -- 0.3 Added Document.encoding, positive\_keywords and negative\_keywords - -Licensing --------- - -This code is under `the Apache License -2.0 `__ license. - -Thanks to ---------- - -- Latest `readability.js `__ -- Ruby port by starrhorne and iterationlabs -- `Python port `__ by gfxmonk -- `Decruft effort ` to move to lxml -- "BR to P" fix from readability.js which improves quality for smaller texts -- Github users contributions. diff --git a/doc/source/conf.py b/doc/source/conf.py index bb261349..e70cf9b3 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # # readability documentation build configuration file, created by # sphinx-quickstart on Thu Mar 23 16:29:38 2017. @@ -38,7 +37,7 @@ "sphinx.ext.doctest", "sphinx.ext.intersphinx", "sphinx.ext.todo", - "recommonmark", + "myst_parser", ] # Add any paths that contain templates here, relative to this directory. @@ -72,7 +71,7 @@ # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = None +language = "en" # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..44992853 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,25 @@ +[tool.poetry] +name = "readability-lxml" +version = "0.8.4.1" +description = "fast html to text parser (article readability tool) with python 3 support" +authors = ["Yuri Baburov "] +license = "Apache License 2.0" +readme = "README.md" +packages = [ + { include = "readability" }, +] + +[tool.poetry.dependencies] +python = ">=3.8.2,<3.14" +chardet = "^5.2.0" +cssselect = [ + { version = "~1.2", markers = "python_version < '3.9'" }, + { version = "~1.3", markers = "python_version >= '3.9'" } +] +lxml = {extras = ["html-clean"], version = "^5.4.0"} +lxml-html-clean = {markers = "python_version < \"3.11\"", version = "^0.4.2"} + + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/readability/__init__.py b/readability/__init__.py index 32e28e3b..b36f021d 100644 --- a/readability/__init__.py +++ b/readability/__init__.py @@ -1,3 +1,3 @@ -__version__ = "0.8.1" +__version__ = "0.8.4.1" from .readability import Document diff --git a/readability/cleaners.py b/readability/cleaners.py index 69825c6b..e0b07260 100644 --- a/readability/cleaners.py +++ b/readability/cleaners.py @@ -1,6 +1,9 @@ # strip out a set of nuisance html attributes that can mess up rendering in RSS feeds import re -from lxml.html.clean import Cleaner +try: + from lxml.html.clean import Cleaner +except ImportError: + from lxml_html_clean import Cleaner bad_attrs = ["width", "height", "style", "[-a-z]*color", "background[-a-z]*", "on*"] single_quoted = "'[^']+'" diff --git a/readability/compat/__init__.py b/readability/compat/__init__.py deleted file mode 100644 index c648633a..00000000 --- a/readability/compat/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -""" -This module contains compatibility helpers for Python 2/3 interoperability. - -It mainly exists because their are certain incompatibilities in the Python -syntax that can only be solved by conditionally importing different functions. -""" -import sys -from lxml.etree import tostring - -if sys.version_info[0] == 2: - bytes_ = str - str_ = unicode - def tostring_(s): - return tostring(s, encoding='utf-8').decode('utf-8') - -elif sys.version_info[0] == 3: - bytes_ = bytes - str_ = str - def tostring_(s): - return tostring(s, encoding='utf-8') diff --git a/readability/compat/three.py b/readability/compat/three.py deleted file mode 100644 index 26351575..00000000 --- a/readability/compat/three.py +++ /dev/null @@ -1,6 +0,0 @@ -def raise_with_traceback(exc_type, traceback, *args, **kwargs): - """ - Raise a new exception of type `exc_type` with an existing `traceback`. All - additional (keyword-)arguments are forwarded to `exc_type` - """ - raise exc_type(*args, **kwargs).with_traceback(traceback) diff --git a/readability/compat/two.py b/readability/compat/two.py deleted file mode 100644 index 642ecb75..00000000 --- a/readability/compat/two.py +++ /dev/null @@ -1,6 +0,0 @@ -def raise_with_traceback(exc_type, traceback, *args, **kwargs): - """ - Raise a new exception of type `exc_type` with an existing `traceback`. All - additional (keyword-)arguments are forwarded to `exc_type` - """ - raise exc_type(*args, **kwargs), None, traceback diff --git a/readability/encoding.py b/readability/encoding.py index e915866a..08332df0 100644 --- a/readability/encoding.py +++ b/readability/encoding.py @@ -1,11 +1,13 @@ import re -import chardet -import sys +try: + import cchardet as chardet +except ImportError: + import chardet -RE_CHARSET = re.compile(br']', flags=re.I) -RE_PRAGMA = re.compile(br']', flags=re.I) -RE_XML = re.compile(br'^<\?xml.*?encoding=["\']*(.+?)["\'>]') +RE_CHARSET = re.compile(r']', flags=re.I) +RE_PRAGMA = re.compile(r']', flags=re.I) +RE_XML = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]') CHARSETS = { "big5": "big5hkscs", @@ -35,25 +37,24 @@ def get_encoding(page): # Try any declared encodings for declared_encoding in declared_encodings: try: - if sys.version_info[0] == 3: - # declared_encoding will actually be bytes but .decode() only - # accepts `str` type. Decode blindly with ascii because no one should - # ever use non-ascii characters in the name of an encoding. - declared_encoding = declared_encoding.decode("ascii", "replace") + # Python3 only + # declared_encoding will actually be bytes but .decode() only + # accepts `str` type. Decode blindly with ascii because no one should + # ever use non-ascii characters in the name of an encoding. + declared_encoding = declared_encoding.decode("ascii", "replace") encoding = fix_charset(declared_encoding) - # Now let's decode the page page.decode(encoding) # It worked! return encoding - except (UnicodeDecodeError, LookupError): + except UnicodeDecodeError: pass # Fallback to chardet if declared encodings fail # Remove all HTML tags, and leave only text for chardet - text = re.sub(br"(\s*]*>)+\s*", b" ", page).strip() - enc = "utf-8" + text = re.sub(r'(\s*]*>)+\s*', ' ', page).strip() + enc = 'utf-8' if len(text) < 10: return enc # can't guess res = chardet.detect(text) diff --git a/readability/htmls.py b/readability/htmls.py index 17a75c7d..d99a9f53 100644 --- a/readability/htmls.py +++ b/readability/htmls.py @@ -4,13 +4,12 @@ from .cleaners import normalize_spaces, clean_attributes from .encoding import get_encoding -from .compat import str_ utf8_parser = lxml.html.HTMLParser(encoding="utf-8") def build_doc(page): - if isinstance(page, str_): + if isinstance(page, str): encoding = None decoded_page = page else: @@ -30,14 +29,14 @@ def js_re(src, pattern, flags, repl): def normalize_entities(cur_title): entities = { - u"\u2014": "-", - u"\u2013": "-", - u"—": "-", - u"–": "-", - u"\u00A0": " ", - u"\u00AB": '"', - u"\u00BB": '"', - u""": '"', + "\u2014": "-", + "\u2013": "-", + "—": "-", + "–": "-", + "\u00A0": " ", + "\u00AB": '"', + "\u00BB": '"', + """: '"', } for c, r in entities.items(): if c in cur_title: @@ -58,6 +57,15 @@ def get_title(doc): return norm_title(title.text) +def get_author(doc): + author = doc.find(".//meta[@name='author']") + if author is None or 'content' not in author.keys() or \ + len(author.get('content')) == 0: + return "[no-author]" + + return author.get('content') + + def add_match(collection, text, orig): text = norm_title(text) if len(text.split()) >= 2 and len(text) >= 15: @@ -102,27 +110,34 @@ def shorten_title(doc): if e.text_content(): add_match(candidates, e.text_content(), orig) + cjk = re.compile('[\u4e00-\u9fff]+') + if candidates: title = sorted(candidates, key=len)[-1] else: for delimiter in [" | ", " - ", " :: ", " / "]: if delimiter in title: parts = orig.split(delimiter) - if len(parts[0].split()) >= 4: - title = parts[0] + p0 = parts[0] + pl = parts[-1] + if (len(p0.split()) >= 4) or (len(p0) >= 4 and cjk.search(p0)): + title = p0 break - elif len(parts[-1].split()) >= 4: - title = parts[-1] + elif (len(pl.split()) >= 4) or (len(pl) >= 4 and cjk.search(pl)): + title = pl break else: if ": " in title: - parts = orig.split(": ") - if len(parts[-1].split()) >= 4: - title = parts[-1] + p1 = orig.split(": ")[-1] + if (len(p1.split()) >= 4) or (len(p1) >= 4 and cjk.search(p1)): + title = p1 else: title = orig.split(": ", 1)[1] - if not 15 < len(title) < 150: + if cjk.search(title): + if not (4 <= len(title) < 100): # Allow length >= 4, cap at 100 + return orig + elif not 15 < len(title) < 150: return orig return title @@ -134,7 +149,9 @@ def get_body(doc): elem.drop_tree() # tostring() always return utf-8 encoded string # FIXME: isn't better to use tounicode? - raw_html = str_(tostring(doc.body or doc)) + raw_html = tostring(doc.body or doc) + if isinstance(raw_html, bytes): + raw_html = raw_html.decode() cleaned = clean_attributes(raw_html) try: # BeautifulSoup(cleaned) #FIXME do we really need to try loading it? diff --git a/readability/readability.py b/readability/readability.py index 7a7d9b3d..c5739056 100755 --- a/readability/readability.py +++ b/readability/readability.py @@ -1,20 +1,25 @@ #!/usr/bin/env python -from __future__ import print_function import logging import re import sys +import urllib.request +import urllib.parse +import urllib.error +from lxml.etree import tostring from lxml.etree import tounicode +from lxml.etree import _ElementTree from lxml.html import document_fromstring from lxml.html import fragment_fromstring +from lxml.html import HtmlElement from .cleaners import clean_attributes from .cleaners import html_cleaner from .htmls import build_doc from .htmls import get_body from .htmls import get_title +from .htmls import get_author from .htmls import shorten_title -from .compat import str_, bytes_, tostring_ from .debug import describe, text_content @@ -37,11 +42,11 @@ "divToPElementsRe": re.compile( r"<(a|blockquote|dl|div|img|ol|p|pre|table|ul)", re.I ), - #'replaceBrsRe': re.compile(r'(]*>[ \n\r\t]*){2,}',re.I), - #'replaceFontsRe': re.compile(r'<(\/?)font[^>]*>',re.I), - #'trimRe': re.compile(r'^\s+|\s+$/'), - #'normalizeRe': re.compile(r'\s{2,}/'), - #'killBreaksRe': re.compile(r'((\s| ?)*){1,}/'), + # 'replaceBrsRe': re.compile(r'(]*>[ \n\r\t]*){2,}',re.I), + # 'replaceFontsRe': re.compile(r'<(\/?)font[^>]*>',re.I), + # 'trimRe': re.compile(r'^\s+|\s+$/'), + # 'normalizeRe': re.compile(r'\s{2,}/'), + # 'killBreaksRe': re.compile(r'((\s| ?)*){1,}/'), "videoRe": re.compile(r"https?:\/\/(www\.)?(youtube|vimeo)\.com", re.I), # skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i, } @@ -77,16 +82,16 @@ def text_length(i): def compile_pattern(elements): if not elements: return None - elif isinstance(elements, re._pattern_type): + elif isinstance(elements, re.Pattern): return elements - elif isinstance(elements, (str_, bytes_)): - if isinstance(elements, bytes_): - elements = str_(elements, "utf-8") - elements = elements.split(u",") + elif isinstance(elements, (str, bytes)): + if isinstance(elements, bytes): + elements = str(elements, "utf-8") + elements = elements.split(",") if isinstance(elements, (list, tuple)): - return re.compile(u"|".join([re.escape(x.strip()) for x in elements]), re.U) + return re.compile("|".join([re.escape(x.strip()) for x in elements]), re.U) else: - raise Exception("Unknown type for the pattern: {}".format(type(elements))) + raise Exception(f"Unknown type for the pattern: {type(elements)}") # assume string or string like object @@ -154,7 +159,11 @@ def _html(self, force=False): return self.html def _parse(self, input): - doc, self.encoding = build_doc(input) + if isinstance(input, (_ElementTree, HtmlElement)): + doc = input + self.encoding = 'utf-8' + else: + doc, self.encoding = build_doc(input) doc = html_cleaner.clean_html(doc) base_href = self.url if base_href: @@ -186,6 +195,10 @@ def title(self): """Returns document title""" return get_title(self._html(True)) + def author(self): + """Returns document author""" + return get_author(self._html(True)) + def short_title(self): """Returns cleaned up document title""" return shorten_title(self._html(True)) @@ -197,12 +210,13 @@ def get_clean_html(self): """ return clean_attributes(tounicode(self.html, method="html")) - def summary(self, html_partial=False): + def summary(self, html_partial=False, keep_all_images=False): """ Given a HTML file, extracts the text of the article. :param html_partial: return only the div of the document, don't wrap in html and body tags. + :param keep_all_images: Keep all images in summary. Warning: It mutates internal DOM representation of the HTML document, so it is better to call other API methods before this one. @@ -231,24 +245,20 @@ def summary(self, html_partial=False): log.info("ruthless removal did not work. ") ruthless = False log.debug( - ( "ended up stripping too much - " "going for a safer _parse" - ) ) # try again continue else: log.debug( - ( "Ruthless and lenient parsing did not work. " "Returning raw html" - ) ) article = self.html.find("body") if article is None: article = self.html - cleaned_article = self.sanitize(article, candidates) + cleaned_article = self.sanitize(article, candidates, keep_all_images) article_length = len(cleaned_article or "") retry_length = self.retry_length @@ -261,11 +271,7 @@ def summary(self, html_partial=False): return cleaned_article except Exception as e: log.exception("error getting summary: ") - if sys.version_info[0] == 2: - from .compat.two import raise_with_traceback - else: - from .compat.three import raise_with_traceback - raise_with_traceback(Unparseable, sys.exc_info()[2], str_(e)) + raise Unparseable(str(e)).with_traceback(sys.exc_info()[2]) def get_article(self, candidates, best_candidate, html_partial=False): # Now that we have the top candidate, look through its siblings for @@ -327,7 +333,7 @@ def select_best_candidate(self, candidates): ) for candidate in sorted_candidates[:5]: elem = candidate["elem"] - log.debug("Top 5 : %6.3f %s" % (candidate["content_score"], describe(elem))) + log.debug("Top 5 : {:6.3f} {}".format(candidate["content_score"], describe(elem))) best_candidate = sorted_candidates[0] return best_candidate @@ -443,7 +449,7 @@ def score_node(self, elem): def remove_unlikely_candidates(self): for elem in self.html.findall(".//*"): - s = "%s %s" % (elem.get("class", ""), elem.get("id", "")) + s = "{} {}".format(elem.get("class", ""), elem.get("id", "")) if len(s) < 2: continue if ( @@ -463,7 +469,8 @@ def transform_misused_divs_into_paragraphs(self): # This results in incorrect results in case there is an # buried within an for example if not REGEXES["divToPElementsRe"].search( - str_(b"".join(map(tostring_, list(elem)))) + str(b"".join(tostring(s, encoding='utf-8') for s in elem)) + # str(b"".join(map(tostring_, list(elem)))) ): # log.debug("Altering %s to p" % (describe(elem))) elem.tag = "p" @@ -490,15 +497,13 @@ def transform_misused_divs_into_paragraphs(self): def tags(self, node, *tag_names): for tag_name in tag_names: - for e in node.findall(".//%s" % tag_name): - yield e + yield from node.findall(".//%s" % tag_name) def reverse_tags(self, node, *tag_names): for tag_name in tag_names: - for e in reversed(node.findall(".//%s" % tag_name)): - yield e + yield from reversed(node.findall(".//%s" % tag_name)) - def sanitize(self, node, candidates): + def sanitize(self, node, candidates, keep_all_images=False): MIN_LEN = self.min_text_length for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"): if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33: @@ -559,8 +564,8 @@ def sanitize(self, node, candidates): to_remove = False reason = "" - # if el.tag == 'div' and counts["img"] >= 1: - # continue + if keep_all_images and el.tag == 'div' and counts["img"] >= 1: + continue if counts["p"] and counts["img"] > 1 + counts["p"] * 1.3: reason = "too many images (%s)" % counts["img"] to_remove = True @@ -583,13 +588,13 @@ def sanitize(self, node, candidates): ) to_remove = True elif weight < 25 and link_density > 0.2: - reason = "too many links %.3f for its weight %s" % ( + reason = "too many links {:.3f} for its weight {}".format( link_density, weight, ) to_remove = True elif weight >= 25 and link_density > 0.5: - reason = "too many links %.3f for its weight %s" % ( + reason = "too many links {:.3f} for its weight {}".format( link_density, weight, ) @@ -715,18 +720,10 @@ def main(): file = None if options.url: headers = {"User-Agent": "Mozilla/5.0"} - if sys.version_info[0] == 3: - import urllib.request, urllib.parse, urllib.error - - request = urllib.request.Request(options.url, None, headers) - file = urllib.request.urlopen(request) - else: - import urllib2 - - request = urllib2.Request(options.url, None, headers) - file = urllib2.urlopen(request) + request = urllib.request.Request(options.url, None, headers) + file = urllib.request.urlopen(request) else: - file = open(args[0], "rt") + file = open(args[0]) try: doc = Document( file.read(), @@ -740,14 +737,8 @@ def main(): result = "

" + doc.short_title() + "


" + doc.summary() open_in_browser(result) else: - enc = ( - sys.__stdout__.encoding or "utf-8" - ) # XXX: this hack could not always work, better to set PYTHONIOENCODING result = "Title:" + doc.short_title() + "\n" + doc.summary() - if sys.version_info[0] == 3: - print(result) - else: - print(result.encode(enc, "replace")) + print(result) finally: file.close() diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 00000000..996bbfc0 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,3 @@ +nose +twine +flake8 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index d6e1198b..00000000 --- a/requirements.txt +++ /dev/null @@ -1 +0,0 @@ --e . diff --git a/setup.py b/setup.py index adb6a239..a88e8185 100755 --- a/setup.py +++ b/setup.py @@ -1,29 +1,16 @@ #!/usr/bin/env python -from __future__ import print_function import codecs import os import re from setuptools import setup -import sys -lxml_requirement = "lxml" -if sys.platform == "darwin": - import platform - - mac_ver = platform.mac_ver()[0] - mac_ver_no = int(mac_ver.split(".")[1]) - if mac_ver_no < 9: - print("Using lxml<2.4") - lxml_requirement = "lxml<2.4" - -test_deps = [ - # Test timeouts - "timeout_decorator", +speed_deps = [ + "cchardet", ] extras = { - "test": test_deps, + 'speed': speed_deps, } # Adapted from https://github.com/pypa/pip/blob/master/setup.py @@ -48,14 +35,19 @@ def find_version(*file_paths): version=find_version("readability", "__init__.py"), author="Yuri Baburov", author_email="burchik@gmail.com", - description="fast html to text parser (article readability tool) with python3 support", + description="fast html to text parser (article readability tool) with python 3 support", test_suite="tests.test_article_only", - long_description=open("README.rst").read(), + long_description=open("README.md").read(), + long_description_content_type="text/markdown", license="Apache License 2.0", url="http://github.com/buriy/python-readability", - packages=["readability", "readability.compat"], - install_requires=["chardet", lxml_requirement, "cssselect"], - tests_require=test_deps, + packages=["readability"], + install_requires=[ + "chardet", + "lxml[html_clean]", + "lxml-html-clean; python_version < '3.11'", + "cssselect" + ], extras_require=extras, classifiers=[ "Environment :: Web Environment", @@ -66,12 +58,13 @@ def find_version(*file_paths): "Topic :: Internet", "Topic :: Software Development :: Libraries :: Python Modules", "Programming Language :: Python", - "Programming Language :: Python :: 2", - "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.5", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: Implementation :: PyPy", ], ) diff --git a/tests/samples/summary-keep-all-images.sample.html b/tests/samples/summary-keep-all-images.sample.html new file mode 100644 index 00000000..127683fc --- /dev/null +++ b/tests/samples/summary-keep-all-images.sample.html @@ -0,0 +1,29 @@ + + + + +

+ + H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline + +

+

+ + Text Text Text Text Text Text Text Text Text Text + +

+
+

+ + Text Text Text Text Text Text Text Text Text Text + +

+ + \ No newline at end of file diff --git a/tests/test_article_only.py b/tests/test_article_only.py index 51dfaea1..fe322121 100644 --- a/tests/test_article_only.py +++ b/tests/test_article_only.py @@ -1,8 +1,33 @@ import os +import time import unittest from readability import Document -import timeout_decorator +from functools import wraps + + +class TimeoutException(Exception): + """Exception raised when a function exceeds its time limit.""" + pass + + +def timeout(seconds): + """Decorator to enforce a timeout on function execution.""" + def decorator(func): + @wraps(func) + def wrapper(*args, **kwargs): + start_time = time.perf_counter() + result = func(*args, **kwargs) + end_time = time.perf_counter() + elapsed_time = end_time - start_time + if elapsed_time > seconds: + raise TimeoutException( + f"Function '{func.__name__}' exceeded time limit of {seconds} seconds " + f"with an execution time of {elapsed_time:.4f} seconds" + ) + return result + return wrapper + return decorator SAMPLES = os.path.join(os.path.dirname(__file__), "samples") @@ -101,7 +126,7 @@ def test_correct_cleanup(self): assert not "aside" in s # Many spaces make some regexes run forever - @timeout_decorator.timeout(seconds=3, use_signals=False) + @timeout(3) def test_many_repeated_spaces(self): long_space = " " * 1000000 sample = "

foo" + long_space + "

" @@ -124,3 +149,85 @@ def test_utf8_kanji(self): sample = load_sample("utf-8-kanji.sample.html") doc = Document(sample) res = doc.summary() + assert 0 < len(res) < 10000 + + def test_author_present(self): + sample = load_sample("the-hurricane-rubin-carter-denzel-washington.html") + doc = Document(sample) + assert 'Alex von Tunzelmann' == doc.author() + + def test_author_absent(self): + sample = load_sample("si-game.sample.html") + doc = Document(sample) + assert '[no-author]' == doc.author() + + def test_keep_images_present(self): + sample = load_sample("summary-keep-all-images.sample.html") + + doc = Document(sample) + + assert " + + 这是标题 + + +
一些无关紧要的内容
+
+

主要文章标题

+

这是主要内容的第一段。

+

これはコンテンツの第2段落です。

+

이것은 콘텐츠의 세 번째 단락입니다.

+

This is the fourth paragraph.

+
+
More irrelevant stuff
+ + + """ + doc = Document(html) + summary = doc.summary() + # Check that the main CJK content is present in the summary + self.assertTrue("这是主要内容的第一段" in summary) + self.assertTrue("これはコンテンツの第2段落です" in summary) + self.assertTrue("이것은 콘텐츠의 세 번째 단락입니다" in summary) + # Check that irrelevant content is mostly gone + self.assertFalse("一些无关紧要的内容" in summary) + + def test_shorten_title_delimiter_bug(self): + """Test that shorten_title handles delimiters correctly when the last part is valid. + + This specifically targets a potential bug where 'p1' might be used instead of 'pl'. + """ + html = """ + + + Short Part | これは長いです + + +
Content
+ + + """ + doc = Document(html) + # With the bug, this call might raise NameError: name 'p1' is not defined + # With the fix, it should correctly return the last part. + short_title = doc.short_title() + self.assertEqual(short_title, "これは長いです") diff --git a/tox.ini b/tox.ini index a9ec295d..926fda50 100644 --- a/tox.ini +++ b/tox.ini @@ -5,7 +5,7 @@ [tox] envlist = - py{27,35,36,37,38,py,py3}, doc + py{38,39,310,311,312,313,py3}, doc skip_missing_interpreters = True @@ -14,7 +14,7 @@ deps = pytest doc: sphinx doc: sphinx_rtd_theme - doc: recommonmark + doc: myst-parser # This creates the virtual envs with --site-packages so already packages # that are already installed will be reused. This is especially useful on @@ -30,4 +30,4 @@ commands = [testenv:doc] commands = - python setup.py build_sphinx + sphinx-build -b html doc/source/ build/