diff --git a/.flake8 b/.flake8
new file mode 100644
index 00000000..b33811f1
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,2 @@
+[flake8]
+ignore = E501, W503
\ No newline at end of file
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 00000000..80224f9e
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1 @@
+tests/samples/* linguist-vendored
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
new file mode 100644
index 00000000..23f16106
--- /dev/null
+++ b/.github/workflows/python-package.yml
@@ -0,0 +1,40 @@
+# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+
+name: Python package
+
+on:
+ push:
+ branches: [ "master" ]
+ pull_request:
+ branches: [ "master" ]
+
+jobs:
+ build:
+
+ runs-on: ubuntu-latest
+ strategy:
+ fail-fast: false
+ matrix:
+ python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]
+
+ steps:
+ - uses: actions/checkout@v4
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v5
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ python -m pip install flake8 pytest
+ if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+ - name: Lint with flake8
+ run: |
+ # stop the build if there are Python syntax errors or undefined names
+ flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+ # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+ flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+ - name: Test with pytest
+ run: |
+ pytest
diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
new file mode 100644
index 00000000..bdaab28a
--- /dev/null
+++ b/.github/workflows/python-publish.yml
@@ -0,0 +1,39 @@
+# This workflow will upload a Python Package using Twine when a release is created
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
+
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third-party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+
+name: Upload Python Package
+
+on:
+ release:
+ types: [published]
+
+permissions:
+ contents: read
+
+jobs:
+ deploy:
+
+ runs-on: ubuntu-latest
+
+ steps:
+ - uses: actions/checkout@v3
+ - name: Set up Python
+ uses: actions/setup-python@v3
+ with:
+ python-version: '3.x'
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install build
+ - name: Build package
+ run: python -m build
+ - name: Publish package
+ uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
+ with:
+ user: __token__
+ password: ${{ secrets.PYPI_API_TOKEN }}
diff --git a/.gitignore b/.gitignore
index 16a2c86e..b532e65e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,8 @@
*.pyc
+__pycache__
*.egg-info
-build
-dist
+/build
+/dist
/bin
/include
/lib
@@ -12,3 +13,6 @@ nosetests.xml
.tox
.idea
.cache
+/.noseids
+/.venv
+/poetry.lock
\ No newline at end of file
diff --git a/.travis.yml b/.travis.yml
index 21e1ce11..ea56f519 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,47 +4,16 @@ cache: pip
matrix:
include:
- - name: "Python 2.7 on Linux"
- python: 2.7
- env: PIP=pip
- - name: "Python 3.5 on Linux"
- python: 3.5
- - name: "Python 3.6 on Linux"
- python: 3.6
- - name: "Python 3.7 on Linux"
- python: 3.7
- name: "Python 3.8 on Linux"
dist: xenial
python: 3.8
- name: "Python 3.9 Nightly on Linux"
dist: bionic
python: nightly
- - name: "Pypy on Linux"
- python: pypy
- env: PIP=pip
- name: "Pypy 3 on Linux"
python: pypy3
- - name: "Python 3.7 on older macOS"
- os: osx
- osx_image: xcode9.4
- language: shell
- env: TOXENV=py37
- before_install:
- - sw_vers
- - python3 --version
- - pip3 --version
- - name: "Python 3.7 on macOS"
- os: osx
- osx_image: xcode11
- language: shell
- env: TOXENV=py37
- before_install:
- - sw_vers
- - python3 --version
- - pip3 --version
allow_failures:
- python: nightly
- - python: pypy
- python: pypy3
- os: osx
diff --git a/Makefile b/Makefile
index 81a14523..9caf08a5 100644
--- a/Makefile
+++ b/Makefile
@@ -4,6 +4,7 @@ PY := .venv/bin/python
PIP := .venv/bin/pip
PEP8 := .venv/bin/pep8
NOSE := .venv/bin/nosetests
+TWINE := .venv/bin/twine
# ###########
# Tests rule!
@@ -12,22 +13,24 @@ NOSE := .venv/bin/nosetests
test: venv develop $(NOSE)
$(NOSE) --with-id -s tests
-$(NOSE):
- $(PIP) install nose pep8 coverage
+$(NOSE): setup
# #######
# INSTALL
# #######
.PHONY: all
-all: venv develop
+all: setup develop
venv: .venv/bin/python
+setup: venv
+ $(PIP) install -r requirements-dev.txt | grep -v "already satisfied" || true
+
.venv/bin/python:
- virtualenv .venv
+ test -d .venv || which python3 && python3 -m venv .venv || virtualenv .venv
-.PHONY: clean_venv
-clean_venv:
+.PHONY: clean
+clean:
rm -rf .venv
develop: .venv/lib/python*/site-packages/readability-lxml.egg-link
@@ -42,18 +45,29 @@ develop: .venv/lib/python*/site-packages/readability-lxml.egg-link
.PHONY: clean_all
-clean_all: clean_venv
+clean_all: clean
+.PHONY: build
+build:
+ poetry build
# ###########
# Deploy
# ###########
.PHONY: dist
dist:
- $(PY) setup.py sdist
+ $(PY) -m pip install wheel
+ $(PY) setup.py sdist bdist_wheel
+ $(TWINE) check dist/*
.PHONY: upload
upload:
- $(PY) setup.py sdist upload
+ $(TWINE) upload dist/*
-.PHONY: version_update
-version_update:
- $(EDITOR) setup.py
+.PHONY: bump
+bump:
+ $(EDITOR) readability/__init__.py
+ $(eval VERSION := $(shell grep "__version__" readability/__init__.py | cut -d'"' -f2))
+ # fix first occurrence of version in pyproject.toml
+ sed -i '0,/version = ".*"/s//version = "$(VERSION)"/' pyproject.toml
+ git commit -m "Bump version to $(VERSION)" pyproject.toml readability/__init__.py
+ git tag $(VERSION)
+ git push --tags
diff --git a/README.md b/README.md
new file mode 100644
index 00000000..e09a515a
--- /dev/null
+++ b/README.md
@@ -0,0 +1,67 @@
+[![PyPI version](https://img.shields.io/pypi/v/readability-lxml.svg)](https://pypi.python.org/pypi/readability-lxml)
+
+# python-readability
+
+Given an HTML document, extract and clean up the main body text and title.
+
+This is a Python port of a Ruby port of [arc90's Readability project](https://web.archive.org/web/20130519040221/http://www.readability.com/).
+
+## Installation
+
+It's easy using `pip`, just run:
+
+```bash
+$ pip install readability-lxml
+```
+
+As an alternative, you may also use conda to install, just run:
+
+```bash
+$ conda install -c conda-forge readability-lxml
+```
+
+## Usage
+
+```python
+>>> import requests
+>>> from readability import Document
+
+>>> response = requests.get('http://example.com')
+>>> doc = Document(response.content)
+>>> doc.title()
+'Example Domain'
+
+>>> doc.summary()
+"""
\n
\n
Example Domain
\n
+
This domain is established to be used for illustrative examples in documents. You may
+use this\n domain in examples without prior coordination or asking for permission.
+\n
More information...
\n
+\n\n
"""
+```
+
+## Change Log
+- 0.8.4 Better CJK support, thanks @cdhigh
+- 0.8.3.1 Support for Python 3.8 - 3.13
+- 0.8.3 We can now save all images via keep_all_images=True (default is to save 1 main image), thanks @botlabsDev
+- 0.8.2 Added article author(s) (thanks @mattblaha)
+- 0.8.1 Fixed processing of non-ascii HTMLs via regexps.
+- 0.8 Replaced XHTML output with HTML5 output in summary() call.
+- 0.7.1 Support for Python 3.7. Fixed a slowdown when processing documents with lots of spaces.
+- 0.7 Improved HTML5 tags handling. Fixed stripping unwanted HTML nodes (only first matching node was removed before).
+- 0.6 Finally a release which supports Python versions 2.6, 2.7, 3.3 - 3.6
+- 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and 3.4
+- 0.4 Added Videos loading and allowed more images per paragraph
+- 0.3 Added Document.encoding, positive\_keywords and negative\_keywords
+
+## Licensing
+
+This code is under [the Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0) license.
+
+## Thanks to
+
+- Latest [readability.js](https://github.com/MHordecki/readability-redux/blob/master/readability/readability.js)
+- Ruby port by starrhorne and iterationlabs
+- [Python port](https://github.com/gfxmonk/python-readability) by gfxmonk
+- [Decruft effort](https://web.archive.org/web/20110214150709/https://www.minvolai.com/blog/decruft-arc90s-readability-in-python/) to move to lxml
+- "BR to P" fix from readability.js which improves quality for smaller texts
+- Github users contributions.
diff --git a/README.rst b/README.rst
deleted file mode 100644
index d4150aca..00000000
--- a/README.rst
+++ /dev/null
@@ -1,68 +0,0 @@
-.. image:: https://travis-ci.org/buriy/python-readability.svg?branch=master
- :target: https://travis-ci.org/buriy/python-readability
-
-
-python-readability
-==================
-
-Given a html document, it pulls out the main body text and cleans it up.
-
-This is a python port of a ruby port of `arc90's readability
-project `__.
-
-Installation
-------------
-
-It's easy using ``pip``, just run:
-
-.. code-block:: bash
-
- $ pip install readability-lxml
-
-Usage
------
-
-.. code-block:: python
-
- >>> import requests
- >>> from readability import Document
-
- >>> response = requests.get('http://example.com')
- >>> doc = Document(response.text)
- >>> doc.title()
- 'Example Domain'
-
- >>> doc.summary()
- """\n
\n
Example Domain
\n
-
This domain is established to be used for illustrative examples in documents. You may
- use this\n domain in examples without prior coordination or asking for permission.
- \n
More information...
\n
- \n\n
"""
-
-Change Log
-----------
-
-- 0.8.1 Fixed processing of non-ascii HTMLs via regexps.
-- 0.8 Replaced XHTML output with HTML5 output in summary() call.
-- 0.7.1 Support for Python 3.7 . Fixed a slowdown when processing documents with lots of spaces.
-- 0.7 Improved HTML5 tags handling. Fixed stripping unwanted HTML nodes (only first matching node was removed before).
-- 0.6 Finally a release which supports Python versions 2.6, 2.7, 3.3 - 3.6
-- 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and 3.4
-- 0.4 Added Videos loading and allowed more images per paragraph
-- 0.3 Added Document.encoding, positive\_keywords and negative\_keywords
-
-Licensing
---------
-
-This code is under `the Apache License
-2.0 `__ license.
-
-Thanks to
----------
-
-- Latest `readability.js `__
-- Ruby port by starrhorne and iterationlabs
-- `Python port `__ by gfxmonk
-- `Decruft effort ` to move to lxml
-- "BR to P" fix from readability.js which improves quality for smaller texts
-- Github users contributions.
diff --git a/doc/source/conf.py b/doc/source/conf.py
index bb261349..e70cf9b3 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
#
# readability documentation build configuration file, created by
# sphinx-quickstart on Thu Mar 23 16:29:38 2017.
@@ -38,7 +37,7 @@
"sphinx.ext.doctest",
"sphinx.ext.intersphinx",
"sphinx.ext.todo",
- "recommonmark",
+ "myst_parser",
]
# Add any paths that contain templates here, relative to this directory.
@@ -72,7 +71,7 @@
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
-language = None
+language = "en"
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..44992853
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,25 @@
+[tool.poetry]
+name = "readability-lxml"
+version = "0.8.4.1"
+description = "fast html to text parser (article readability tool) with python 3 support"
+authors = ["Yuri Baburov <burchik@gmail.com>"]
+license = "Apache License 2.0"
+readme = "README.md"
+packages = [
+ { include = "readability" },
+]
+
+[tool.poetry.dependencies]
+python = ">=3.8.2,<3.14"
+chardet = "^5.2.0"
+cssselect = [
+ { version = "~1.2", markers = "python_version < '3.9'" },
+ { version = "~1.3", markers = "python_version >= '3.9'" }
+]
+lxml = {extras = ["html-clean"], version = "^5.4.0"}
+lxml-html-clean = {markers = "python_version < \"3.11\"", version = "^0.4.2"}
+
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
diff --git a/readability/__init__.py b/readability/__init__.py
index 32e28e3b..b36f021d 100644
--- a/readability/__init__.py
+++ b/readability/__init__.py
@@ -1,3 +1,3 @@
-__version__ = "0.8.1"
+__version__ = "0.8.4.1"
from .readability import Document
diff --git a/readability/cleaners.py b/readability/cleaners.py
index 69825c6b..e0b07260 100644
--- a/readability/cleaners.py
+++ b/readability/cleaners.py
@@ -1,6 +1,9 @@
# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
import re
-from lxml.html.clean import Cleaner
+try:
+ from lxml.html.clean import Cleaner
+except ImportError:
+ from lxml_html_clean import Cleaner
bad_attrs = ["width", "height", "style", "[-a-z]*color", "background[-a-z]*", "on*"]
single_quoted = "'[^']+'"
diff --git a/readability/compat/__init__.py b/readability/compat/__init__.py
deleted file mode 100644
index c648633a..00000000
--- a/readability/compat/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-"""
-This module contains compatibility helpers for Python 2/3 interoperability.
-
-It mainly exists because their are certain incompatibilities in the Python
-syntax that can only be solved by conditionally importing different functions.
-"""
-import sys
-from lxml.etree import tostring
-
-if sys.version_info[0] == 2:
- bytes_ = str
- str_ = unicode
- def tostring_(s):
- return tostring(s, encoding='utf-8').decode('utf-8')
-
-elif sys.version_info[0] == 3:
- bytes_ = bytes
- str_ = str
- def tostring_(s):
- return tostring(s, encoding='utf-8')
diff --git a/readability/compat/three.py b/readability/compat/three.py
deleted file mode 100644
index 26351575..00000000
--- a/readability/compat/three.py
+++ /dev/null
@@ -1,6 +0,0 @@
-def raise_with_traceback(exc_type, traceback, *args, **kwargs):
- """
- Raise a new exception of type `exc_type` with an existing `traceback`. All
- additional (keyword-)arguments are forwarded to `exc_type`
- """
- raise exc_type(*args, **kwargs).with_traceback(traceback)
diff --git a/readability/compat/two.py b/readability/compat/two.py
deleted file mode 100644
index 642ecb75..00000000
--- a/readability/compat/two.py
+++ /dev/null
@@ -1,6 +0,0 @@
-def raise_with_traceback(exc_type, traceback, *args, **kwargs):
- """
- Raise a new exception of type `exc_type` with an existing `traceback`. All
- additional (keyword-)arguments are forwarded to `exc_type`
- """
- raise exc_type(*args, **kwargs), None, traceback
diff --git a/readability/encoding.py b/readability/encoding.py
index e915866a..08332df0 100644
--- a/readability/encoding.py
+++ b/readability/encoding.py
@@ -1,11 +1,13 @@
import re
-import chardet
-import sys
+try:
+ import cchardet as chardet
+except ImportError:
+ import chardet
-RE_CHARSET = re.compile(br'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
-RE_PRAGMA = re.compile(br'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
-RE_XML = re.compile(br'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
+RE_CHARSET = re.compile(br'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
+RE_PRAGMA = re.compile(br'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
+RE_XML = re.compile(br'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
CHARSETS = {
"big5": "big5hkscs",
@@ -35,25 +37,24 @@ def get_encoding(page):
# Try any declared encodings
for declared_encoding in declared_encodings:
try:
- if sys.version_info[0] == 3:
- # declared_encoding will actually be bytes but .decode() only
- # accepts `str` type. Decode blindly with ascii because no one should
- # ever use non-ascii characters in the name of an encoding.
- declared_encoding = declared_encoding.decode("ascii", "replace")
+ # Python3 only
+ # declared_encoding will actually be bytes but .decode() only
+ # accepts `str` type. Decode blindly with ascii because no one should
+ # ever use non-ascii characters in the name of an encoding.
+ declared_encoding = declared_encoding.decode("ascii", "replace")
encoding = fix_charset(declared_encoding)
-
# Now let's decode the page
page.decode(encoding)
# It worked!
return encoding
- except (UnicodeDecodeError, LookupError):
+ except (UnicodeDecodeError, LookupError):
pass
# Fallback to chardet if declared encodings fail
# Remove all HTML tags, and leave only text for chardet
-    text = re.sub(br"(\s*<[^>]*>)+\s*", b" ", page).strip()
-    enc = "utf-8"
+    text = re.sub(br'(\s*<[^>]*>)+\s*', b' ', page).strip()
+    enc = 'utf-8'
if len(text) < 10:
return enc # can't guess
res = chardet.detect(text)
diff --git a/readability/htmls.py b/readability/htmls.py
index 17a75c7d..d99a9f53 100644
--- a/readability/htmls.py
+++ b/readability/htmls.py
@@ -4,13 +4,12 @@
from .cleaners import normalize_spaces, clean_attributes
from .encoding import get_encoding
-from .compat import str_
utf8_parser = lxml.html.HTMLParser(encoding="utf-8")
def build_doc(page):
- if isinstance(page, str_):
+ if isinstance(page, str):
encoding = None
decoded_page = page
else:
@@ -30,14 +29,14 @@ def js_re(src, pattern, flags, repl):
def normalize_entities(cur_title):
entities = {
- u"\u2014": "-",
- u"\u2013": "-",
- u"—": "-",
- u"–": "-",
- u"\u00A0": " ",
- u"\u00AB": '"',
- u"\u00BB": '"',
- u""": '"',
+ "\u2014": "-",
+ "\u2013": "-",
+ "—": "-",
+ "–": "-",
+ "\u00A0": " ",
+ "\u00AB": '"',
+ "\u00BB": '"',
+ """: '"',
}
for c, r in entities.items():
if c in cur_title:
@@ -58,6 +57,15 @@ def get_title(doc):
return norm_title(title.text)
+def get_author(doc):
+ author = doc.find(".//meta[@name='author']")
+ if author is None or 'content' not in author.keys() or \
+ len(author.get('content')) == 0:
+ return "[no-author]"
+
+ return author.get('content')
+
+
def add_match(collection, text, orig):
text = norm_title(text)
if len(text.split()) >= 2 and len(text) >= 15:
@@ -102,27 +110,34 @@ def shorten_title(doc):
if e.text_content():
add_match(candidates, e.text_content(), orig)
+ cjk = re.compile('[\u4e00-\u9fff]+')
+
if candidates:
title = sorted(candidates, key=len)[-1]
else:
for delimiter in [" | ", " - ", " :: ", " / "]:
if delimiter in title:
parts = orig.split(delimiter)
- if len(parts[0].split()) >= 4:
- title = parts[0]
+ p0 = parts[0]
+ pl = parts[-1]
+ if (len(p0.split()) >= 4) or (len(p0) >= 4 and cjk.search(p0)):
+ title = p0
break
- elif len(parts[-1].split()) >= 4:
- title = parts[-1]
+ elif (len(pl.split()) >= 4) or (len(pl) >= 4 and cjk.search(pl)):
+ title = pl
break
else:
if ": " in title:
- parts = orig.split(": ")
- if len(parts[-1].split()) >= 4:
- title = parts[-1]
+ p1 = orig.split(": ")[-1]
+ if (len(p1.split()) >= 4) or (len(p1) >= 4 and cjk.search(p1)):
+ title = p1
else:
title = orig.split(": ", 1)[1]
- if not 15 < len(title) < 150:
+ if cjk.search(title):
+ if not (4 <= len(title) < 100): # Allow length >= 4, cap at 100
+ return orig
+ elif not 15 < len(title) < 150:
return orig
return title
@@ -134,7 +149,9 @@ def get_body(doc):
elem.drop_tree()
# tostring() always return utf-8 encoded string
# FIXME: isn't better to use tounicode?
- raw_html = str_(tostring(doc.body or doc))
+ raw_html = tostring(doc.body or doc)
+ if isinstance(raw_html, bytes):
+ raw_html = raw_html.decode()
cleaned = clean_attributes(raw_html)
try:
# BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
diff --git a/readability/readability.py b/readability/readability.py
index 7a7d9b3d..c5739056 100755
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -1,20 +1,25 @@
#!/usr/bin/env python
-from __future__ import print_function
import logging
import re
import sys
+import urllib.request
+import urllib.parse
+import urllib.error
+from lxml.etree import tostring
from lxml.etree import tounicode
+from lxml.etree import _ElementTree
from lxml.html import document_fromstring
from lxml.html import fragment_fromstring
+from lxml.html import HtmlElement
from .cleaners import clean_attributes
from .cleaners import html_cleaner
from .htmls import build_doc
from .htmls import get_body
from .htmls import get_title
+from .htmls import get_author
from .htmls import shorten_title
-from .compat import str_, bytes_, tostring_
from .debug import describe, text_content
@@ -37,11 +42,11 @@
"divToPElementsRe": re.compile(
r"<(a|blockquote|dl|div|img|ol|p|pre|table|ul)", re.I
),
- #'replaceBrsRe': re.compile(r'(
]*>[ \n\r\t]*){2,}',re.I),
- #'replaceFontsRe': re.compile(r'<(\/?)font[^>]*>',re.I),
- #'trimRe': re.compile(r'^\s+|\s+$/'),
- #'normalizeRe': re.compile(r'\s{2,}/'),
- #'killBreaksRe': re.compile(r'(
(\s| ?)*){1,}/'),
+ # 'replaceBrsRe': re.compile(r'(
]*>[ \n\r\t]*){2,}',re.I),
+ # 'replaceFontsRe': re.compile(r'<(\/?)font[^>]*>',re.I),
+ # 'trimRe': re.compile(r'^\s+|\s+$/'),
+ # 'normalizeRe': re.compile(r'\s{2,}/'),
+ # 'killBreaksRe': re.compile(r'(
(\s| ?)*){1,}/'),
"videoRe": re.compile(r"https?:\/\/(www\.)?(youtube|vimeo)\.com", re.I),
# skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
}
@@ -77,16 +82,16 @@ def text_length(i):
def compile_pattern(elements):
if not elements:
return None
- elif isinstance(elements, re._pattern_type):
+ elif isinstance(elements, re.Pattern):
return elements
- elif isinstance(elements, (str_, bytes_)):
- if isinstance(elements, bytes_):
- elements = str_(elements, "utf-8")
- elements = elements.split(u",")
+ elif isinstance(elements, (str, bytes)):
+ if isinstance(elements, bytes):
+ elements = str(elements, "utf-8")
+ elements = elements.split(",")
if isinstance(elements, (list, tuple)):
- return re.compile(u"|".join([re.escape(x.strip()) for x in elements]), re.U)
+ return re.compile("|".join([re.escape(x.strip()) for x in elements]), re.U)
else:
- raise Exception("Unknown type for the pattern: {}".format(type(elements)))
+ raise Exception(f"Unknown type for the pattern: {type(elements)}")
# assume string or string like object
@@ -154,7 +159,11 @@ def _html(self, force=False):
return self.html
def _parse(self, input):
- doc, self.encoding = build_doc(input)
+ if isinstance(input, (_ElementTree, HtmlElement)):
+ doc = input
+ self.encoding = 'utf-8'
+ else:
+ doc, self.encoding = build_doc(input)
doc = html_cleaner.clean_html(doc)
base_href = self.url
if base_href:
@@ -186,6 +195,10 @@ def title(self):
"""Returns document title"""
return get_title(self._html(True))
+ def author(self):
+ """Returns document author"""
+ return get_author(self._html(True))
+
def short_title(self):
"""Returns cleaned up document title"""
return shorten_title(self._html(True))
@@ -197,12 +210,13 @@ def get_clean_html(self):
"""
return clean_attributes(tounicode(self.html, method="html"))
- def summary(self, html_partial=False):
+ def summary(self, html_partial=False, keep_all_images=False):
"""
Given a HTML file, extracts the text of the article.
:param html_partial: return only the div of the document, don't wrap
in html and body tags.
+ :param keep_all_images: Keep all images in summary.
Warning: It mutates internal DOM representation of the HTML document,
so it is better to call other API methods before this one.
@@ -231,24 +245,20 @@ def summary(self, html_partial=False):
log.info("ruthless removal did not work. ")
ruthless = False
log.debug(
- (
"ended up stripping too much - "
"going for a safer _parse"
- )
)
# try again
continue
else:
log.debug(
- (
"Ruthless and lenient parsing did not work. "
"Returning raw html"
- )
)
article = self.html.find("body")
if article is None:
article = self.html
- cleaned_article = self.sanitize(article, candidates)
+ cleaned_article = self.sanitize(article, candidates, keep_all_images)
article_length = len(cleaned_article or "")
retry_length = self.retry_length
@@ -261,11 +271,7 @@ def summary(self, html_partial=False):
return cleaned_article
except Exception as e:
log.exception("error getting summary: ")
- if sys.version_info[0] == 2:
- from .compat.two import raise_with_traceback
- else:
- from .compat.three import raise_with_traceback
- raise_with_traceback(Unparseable, sys.exc_info()[2], str_(e))
+ raise Unparseable(str(e)).with_traceback(sys.exc_info()[2])
def get_article(self, candidates, best_candidate, html_partial=False):
# Now that we have the top candidate, look through its siblings for
@@ -327,7 +333,7 @@ def select_best_candidate(self, candidates):
)
for candidate in sorted_candidates[:5]:
elem = candidate["elem"]
- log.debug("Top 5 : %6.3f %s" % (candidate["content_score"], describe(elem)))
+ log.debug("Top 5 : {:6.3f} {}".format(candidate["content_score"], describe(elem)))
best_candidate = sorted_candidates[0]
return best_candidate
@@ -443,7 +449,7 @@ def score_node(self, elem):
def remove_unlikely_candidates(self):
for elem in self.html.findall(".//*"):
- s = "%s %s" % (elem.get("class", ""), elem.get("id", ""))
+ s = "{} {}".format(elem.get("class", ""), elem.get("id", ""))
if len(s) < 2:
continue
if (
@@ -463,7 +469,8 @@ def transform_misused_divs_into_paragraphs(self):
# This results in incorrect results in case there is an
# buried within an for example
if not REGEXES["divToPElementsRe"].search(
- str_(b"".join(map(tostring_, list(elem))))
+ str(b"".join(tostring(s, encoding='utf-8') for s in elem))
+ # str(b"".join(map(tostring_, list(elem))))
):
# log.debug("Altering %s to p" % (describe(elem)))
elem.tag = "p"
@@ -490,15 +497,13 @@ def transform_misused_divs_into_paragraphs(self):
def tags(self, node, *tag_names):
for tag_name in tag_names:
- for e in node.findall(".//%s" % tag_name):
- yield e
+ yield from node.findall(".//%s" % tag_name)
def reverse_tags(self, node, *tag_names):
for tag_name in tag_names:
- for e in reversed(node.findall(".//%s" % tag_name)):
- yield e
+ yield from reversed(node.findall(".//%s" % tag_name))
- def sanitize(self, node, candidates):
+ def sanitize(self, node, candidates, keep_all_images=False):
MIN_LEN = self.min_text_length
for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33:
@@ -559,8 +564,8 @@ def sanitize(self, node, candidates):
to_remove = False
reason = ""
- # if el.tag == 'div' and counts["img"] >= 1:
- # continue
+ if keep_all_images and el.tag == 'div' and counts["img"] >= 1:
+ continue
if counts["p"] and counts["img"] > 1 + counts["p"] * 1.3:
reason = "too many images (%s)" % counts["img"]
to_remove = True
@@ -583,13 +588,13 @@ def sanitize(self, node, candidates):
)
to_remove = True
elif weight < 25 and link_density > 0.2:
- reason = "too many links %.3f for its weight %s" % (
+ reason = "too many links {:.3f} for its weight {}".format(
link_density,
weight,
)
to_remove = True
elif weight >= 25 and link_density > 0.5:
- reason = "too many links %.3f for its weight %s" % (
+ reason = "too many links {:.3f} for its weight {}".format(
link_density,
weight,
)
@@ -715,18 +720,10 @@ def main():
file = None
if options.url:
headers = {"User-Agent": "Mozilla/5.0"}
- if sys.version_info[0] == 3:
- import urllib.request, urllib.parse, urllib.error
-
- request = urllib.request.Request(options.url, None, headers)
- file = urllib.request.urlopen(request)
- else:
- import urllib2
-
- request = urllib2.Request(options.url, None, headers)
- file = urllib2.urlopen(request)
+ request = urllib.request.Request(options.url, None, headers)
+ file = urllib.request.urlopen(request)
else:
- file = open(args[0], "rt")
+ file = open(args[0])
try:
doc = Document(
file.read(),
@@ -740,14 +737,8 @@ def main():
result = "" + doc.short_title() + "
" + doc.summary()
open_in_browser(result)
else:
- enc = (
- sys.__stdout__.encoding or "utf-8"
- ) # XXX: this hack could not always work, better to set PYTHONIOENCODING
result = "Title:" + doc.short_title() + "\n" + doc.summary()
- if sys.version_info[0] == 3:
- print(result)
- else:
- print(result.encode(enc, "replace"))
+ print(result)
finally:
file.close()
diff --git a/requirements-dev.txt b/requirements-dev.txt
new file mode 100644
index 00000000..996bbfc0
--- /dev/null
+++ b/requirements-dev.txt
@@ -0,0 +1,3 @@
+nose
+twine
+flake8
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index d6e1198b..00000000
--- a/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
--e .
diff --git a/setup.py b/setup.py
index adb6a239..a88e8185 100755
--- a/setup.py
+++ b/setup.py
@@ -1,29 +1,16 @@
#!/usr/bin/env python
-from __future__ import print_function
import codecs
import os
import re
from setuptools import setup
-import sys
-lxml_requirement = "lxml"
-if sys.platform == "darwin":
- import platform
-
- mac_ver = platform.mac_ver()[0]
- mac_ver_no = int(mac_ver.split(".")[1])
- if mac_ver_no < 9:
- print("Using lxml<2.4")
- lxml_requirement = "lxml<2.4"
-
-test_deps = [
- # Test timeouts
- "timeout_decorator",
+speed_deps = [
+ "cchardet",
]
extras = {
- "test": test_deps,
+ 'speed': speed_deps,
}
# Adapted from https://github.com/pypa/pip/blob/master/setup.py
@@ -48,14 +35,19 @@ def find_version(*file_paths):
version=find_version("readability", "__init__.py"),
author="Yuri Baburov",
author_email="burchik@gmail.com",
- description="fast html to text parser (article readability tool) with python3 support",
+ description="fast html to text parser (article readability tool) with python 3 support",
test_suite="tests.test_article_only",
- long_description=open("README.rst").read(),
+ long_description=open("README.md").read(),
+ long_description_content_type="text/markdown",
license="Apache License 2.0",
url="http://github.com/buriy/python-readability",
- packages=["readability", "readability.compat"],
- install_requires=["chardet", lxml_requirement, "cssselect"],
- tests_require=test_deps,
+ packages=["readability"],
+ install_requires=[
+ "chardet",
+ "lxml[html_clean]",
+ "lxml-html-clean; python_version < '3.11'",
+ "cssselect"
+ ],
extras_require=extras,
classifiers=[
"Environment :: Web Environment",
@@ -66,12 +58,13 @@ def find_version(*file_paths):
"Topic :: Internet",
"Topic :: Software Development :: Libraries :: Python Modules",
"Programming Language :: Python",
- "Programming Language :: Python :: 2",
- "Programming Language :: Python :: 2.7",
"Programming Language :: Python :: 3",
- "Programming Language :: Python :: 3.5",
- "Programming Language :: Python :: 3.6",
- "Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
+ "Programming Language :: Python :: 3.9",
+ "Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: 3.12",
+ "Programming Language :: Python :: 3.13",
+ "Programming Language :: Python :: Implementation :: PyPy",
],
)
diff --git a/tests/samples/summary-keep-all-images.sample.html b/tests/samples/summary-keep-all-images.sample.html
new file mode 100644
index 00000000..127683fc
--- /dev/null
+++ b/tests/samples/summary-keep-all-images.sample.html
@@ -0,0 +1,29 @@
+
+
+
+
+
+
+ H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline
+
+
+
+
+ Text Text Text Text Text Text Text Text Text Text
+
+
+
+
+
+ Text Text Text Text Text Text Text Text Text Text
+
+
+
+
\ No newline at end of file
diff --git a/tests/test_article_only.py b/tests/test_article_only.py
index 51dfaea1..fe322121 100644
--- a/tests/test_article_only.py
+++ b/tests/test_article_only.py
@@ -1,8 +1,33 @@
import os
+import time
import unittest
from readability import Document
-import timeout_decorator
+from functools import wraps
+
+
+class TimeoutException(Exception):
+ """Exception raised when a function exceeds its time limit."""
+ pass
+
+
+def timeout(seconds):
+ """Decorator to enforce a timeout on function execution."""
+ def decorator(func):
+ @wraps(func)
+ def wrapper(*args, **kwargs):
+ start_time = time.perf_counter()
+ result = func(*args, **kwargs)
+ end_time = time.perf_counter()
+ elapsed_time = end_time - start_time
+ if elapsed_time > seconds:
+ raise TimeoutException(
+ f"Function '{func.__name__}' exceeded time limit of {seconds} seconds "
+ f"with an execution time of {elapsed_time:.4f} seconds"
+ )
+ return result
+ return wrapper
+ return decorator
SAMPLES = os.path.join(os.path.dirname(__file__), "samples")
@@ -101,7 +126,7 @@ def test_correct_cleanup(self):
assert not "aside" in s
# Many spaces make some regexes run forever
- @timeout_decorator.timeout(seconds=3, use_signals=False)
+ @timeout(3)
def test_many_repeated_spaces(self):
long_space = " " * 1000000
sample = "foo" + long_space + "
"
@@ -124,3 +149,85 @@ def test_utf8_kanji(self):
sample = load_sample("utf-8-kanji.sample.html")
doc = Document(sample)
res = doc.summary()
+ assert 0 < len(res) < 10000
+
+ def test_author_present(self):
+ sample = load_sample("the-hurricane-rubin-carter-denzel-washington.html")
+ doc = Document(sample)
+ assert 'Alex von Tunzelmann' == doc.author()
+
+ def test_author_absent(self):
+ sample = load_sample("si-game.sample.html")
+ doc = Document(sample)
+ assert '[no-author]' == doc.author()
+
+ def test_keep_images_present(self):
+ sample = load_sample("summary-keep-all-images.sample.html")
+
+ doc = Document(sample)
+
+ assert "
+
+ 这是标题
+
+
+ 一些无关紧要的内容
+
+
主要文章标题
+
这是主要内容的第一段。
+
これはコンテンツの第2段落です。
+
이것은 콘텐츠의 세 번째 단락입니다.
+
This is the fourth paragraph.
+
+ More irrelevant stuff
+
+