diff --git a/.flake8 b/.flake8 new file mode 100644 index 00000000..b33811f1 --- /dev/null +++ b/.flake8 @@ -0,0 +1,2 @@ +[flake8] +ignore = E501, W503 \ No newline at end of file diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..80224f9e --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +tests/samples/* linguist-vendored diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 00000000..23f16106 --- /dev/null +++ b/.github/workflows/python-package.yml @@ -0,0 +1,40 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: Python package + +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install flake8 pytest + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + pytest diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml new file mode 100644 index 00000000..bdaab28a --- /dev/null +++ b/.github/workflows/python-publish.yml @@ -0,0 +1,39 @@ +# This workflow will upload a Python Package using Twine when a release is created +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries + +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. 
+ +name: Upload Python Package + +on: + release: + types: [published] + +permissions: + contents: read + +jobs: + deploy: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: '3.x' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build + - name: Build package + run: python -m build + - name: Publish package + uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/.gitignore b/.gitignore index 16a2c86e..b532e65e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,8 @@ *.pyc +__pycache__ *.egg-info -build -dist +/build +/dist /bin /include /lib @@ -12,3 +13,6 @@ nosetests.xml .tox .idea .cache +/.noseids +/.venv +/poetry.lock \ No newline at end of file diff --git a/.travis.yml b/.travis.yml index 9a668090..ea56f519 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,17 +1,29 @@ language: python +os: linux +cache: pip -python: - - "3.4" - -env: - - TOX_ENV=py26 - - TOX_ENV=py27 - - TOX_ENV=py33 - - TOX_ENV=py34 +matrix: + include: + - name: "Python 3.8 on Linux" + dist: xenial + python: 3.8 + - name: "Python 3.9 Nightly on Linux" + dist: bionic + python: nightly + - name: "Pypy 3 on Linux" + python: pypy3 + allow_failures: + - python: nightly + - python: pypy3 + - os: osx install: - - travis_retry pip install -U pip wheel tox - - travis_retry pip install -U -r requirements.txt -e . + - if [ $PIP ]; then true; else PIP=pip3; fi + - travis_retry $PIP install -U pip wheel tox-travis pytest-cov codecov + - travis_retry $PIP install -U -r requirements.txt -e ".[test]" script: - - tox -e $TOX_ENV + - tox + +after_success: + - codecov diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..261eeb9e --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/Makefile b/Makefile index 0a28f375..9caf08a5 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,10 @@ # Makefile to help automate tasks WD := $(shell pwd) -PY := .env/bin/python -PIP := .env/bin/pip -PEP8 := .env/bin/pep8 -NOSE := .env/bin/nosetests - +PY := .venv/bin/python +PIP := .venv/bin/pip +PEP8 := .venv/bin/pep8 +NOSE := .venv/bin/nosetests +TWINE := .venv/bin/twine # ########### # Tests rule! 
@@ -13,25 +13,29 @@ NOSE := .env/bin/nosetests test: venv develop $(NOSE) $(NOSE) --with-id -s tests -$(NOSE): - $(PIP) install nose pep8 coverage +$(NOSE): setup # ####### # INSTALL # ####### .PHONY: all -all: venv develop +all: setup develop + +venv: .venv/bin/python + +setup: venv + $(PIP) install -r requirements-dev.txt | grep -v "already satisfied" || true + +.venv/bin/python: + test -d .venv || which python3 && python3 -m venv .venv || virtualenv .venv -venv: bin/python -bin/python: - virtualenv .env +.PHONY: clean +clean: + rm -rf .venv -.PHONY: clean_venv -clean_venv: - rm -rf .env +develop: .venv/lib/python*/site-packages/readability-lxml.egg-link -develop: .env/lib/python*/site-packages/readability-lxml.egg-link -.env/lib/python*/site-packages/readability-lxml.egg-link: +.venv/lib/python*/site-packages/readability-lxml.egg-link: $(PY) setup.py develop @@ -41,18 +45,29 @@ develop: .env/lib/python*/site-packages/readability-lxml.egg-link .PHONY: clean_all clean_all: clean_venv +.PHONY: build +build: + poetry build # ########### # Deploy # ########### .PHONY: dist dist: - $(PY) setup.py sdist + $(PY) -m pip install wheel + $(PY) setup.py sdist bdist_wheel + $(TWINE) check dist/* .PHONY: upload upload: - $(PY) setup.py sdist upload + $(TWINE) upload dist/* -.PHONY: version_update -version_update: - $(EDITOR) setup.py +.PHONY: bump +bump: + $(EDITOR) readability/__init__.py + $(eval VERSION := $(shell grep "__version__" readability/__init__.py | cut -d'"' -f2)) + # fix first occurrence of version in pyproject.toml + sed -i '0,/version = ".*"/s//version = "$(VERSION)"/' pyproject.toml + git commit -m "Bump version to $(VERSION)" pyproject.toml readability/__init__.py + git tag $(VERSION) + git push --tags diff --git a/README.md b/README.md new file mode 100644 index 00000000..e09a515a --- /dev/null +++ b/README.md @@ -0,0 +1,67 @@ +[![PyPI version](https://img.shields.io/pypi/v/readability-lxml.svg)](https://pypi.python.org/pypi/readability-lxml) + +# python-readability + +Given an HTML document, extract and clean up the main body text and title. + +This is a Python port of a Ruby port of [arc90's Readability project](https://web.archive.org/web/20130519040221/http://www.readability.com/). + +## Installation + +It's easy using `pip`, just run: + +```bash +$ pip install readability-lxml +``` + +As an alternative, you may also use conda to install, just run: + +```bash +$ conda install -c conda-forge readability-lxml +``` + +## Usage + +```python +>>> import requests +>>> from readability import Document + +>>> response = requests.get('http://example.com') +>>> doc = Document(response.content) +>>> doc.title() +'Example Domain' + +>>> doc.summary() +"""
<html><body><div><body id="readabilityBody">\n<div>\n    <h1>Example Domain</h1>\n
+<p>This domain is established to be used for illustrative examples in documents. You may
+use this\n    domain in examples without prior coordination or asking for permission.</p>\n
+<p><a href="https://www.iana.org/domains/example">More information...</a></p>\n</div>\n
+</body>\n</div></body></html>
""" +``` + +## Change Log +- 0.8.4 Better CJK support, thanks @cdhigh +- 0.8.3.1 Support for python 3.8 - 3.13 +- 0.8.3 We can now save all images via keep_all_images=True (default is to save 1 main image), thanks @botlabsDev +- 0.8.2 Added article author(s) (thanks @mattblaha) +- 0.8.1 Fixed processing of non-ascii HTMLs via regexps. +- 0.8 Replaced XHTML output with HTML5 output in summary() call. +- 0.7.1 Support for Python 3.7 . Fixed a slowdown when processing documents with lots of spaces. +- 0.7 Improved HTML5 tags handling. Fixed stripping unwanted HTML nodes (only first matching node was removed before). +- 0.6 Finally a release which supports Python versions 2.6, 2.7, 3.3 - 3.6 +- 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and 3.4 +- 0.4 Added Videos loading and allowed more images per paragraph +- 0.3 Added Document.encoding, positive\_keywords and negative\_keywords + +## Licensing + +This code is under [the Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0) license. + +## Thanks to + +- Latest [readability.js](https://github.com/MHordecki/readability-redux/blob/master/readability/readability.js) +- Ruby port by starrhorne and iterationlabs +- [Python port](https://github.com/gfxmonk/python-readability) by gfxmonk +- [Decruft effort](https://web.archive.org/web/20110214150709/https://www.minvolai.com/blog/decruft-arc90s-readability-in-python/) to move to lxml +- "BR to P" fix from readability.js which improves quality for smaller texts +- Github users contributions. diff --git a/README.rst b/README.rst deleted file mode 100644 index 51eac4af..00000000 --- a/README.rst +++ /dev/null @@ -1,65 +0,0 @@ -.. image:: https://travis-ci.org/buriy/python-readability.svg?branch=master - :target: https://travis-ci.org/buriy/python-readability - - -python-readability -================== - -Given a html document, it pulls out the main body text and cleans it up. - -This is a python port of a ruby port of `arc90's readability -project `__. - -Installation ------------- - -It's easy using ``pip``, just run: - -:: - - $ pip install readability-lxml - -Usage ------ - -:: - - >> import requests - >> from readability import Document - >> - >> response = requests.get('http://example.com') - >> doc = Document(response.text) - >> doc.title() - >> 'Example Domain' - -Change Log ----------- - -- 0.3 Added Document.encoding, positive\_keywords and - negative\_keywords -- 0.4 Added Videos loading and allowed more images per paragraph -- 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and - 3.4 -- 0.6 Finally a release which supports Python versions 2.6, 2.7, 3.3 - and 3.4 - -Licensing -========= - -This code is under `the Apache License -2.0 `__ license. - -Thanks to ---------- - -- Latest - `readability.js `__ -- Ruby port by starrhorne and iterationlabs -- `Python port `__ by - gfxmonk -- `Decruft - effort `__ - to move to lxml -- "BR to P" fix from readability.js which improves quality for smaller - texts -- Github users contributions. diff --git a/doc/__init__.py b/doc/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/doc/source/__init__.py b/doc/source/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/doc/source/api.rst b/doc/source/api.rst new file mode 100644 index 00000000..b0e3bbbb --- /dev/null +++ b/doc/source/api.rst @@ -0,0 +1,30 @@ +Reference +========= + +.. automodule:: readability + :members: + :show-inheritance: + +.. automodule:: readability.browser + :members: + :show-inheritance: + +.. 
automodule:: readability.cleaners + :members: + :show-inheritance: + +.. automodule:: readability.debug + :members: + :show-inheritance: + +.. automodule:: readability.encoding + :members: + :show-inheritance: + +.. automodule:: readability.htmls + :members: + :show-inheritance: + +.. automodule:: readability.readability + :members: + :show-inheritance: diff --git a/doc/source/conf.py b/doc/source/conf.py new file mode 100644 index 00000000..e70cf9b3 --- /dev/null +++ b/doc/source/conf.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 +# +# readability documentation build configuration file, created by +# sphinx-quickstart on Thu Mar 23 16:29:38 2017. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +import sys + +sys.path.insert(0, os.path.abspath("../..")) + +import readability + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.doctest", + "sphinx.ext.intersphinx", + "sphinx.ext.todo", + "myst_parser", +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates"] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +source_suffix = [".rst", ".md"] + +# The master toctree document. +master_doc = "index" + +# General information about the project. +project = "readability" +copyright = "2020, Yuri Baburov" +author = "Yuri Baburov" + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. + +# The short X.Y version. +version = readability.__version__ + +# The full version, including alpha/beta/rc tags. +release = readability.__version__ + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = "en" + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This patterns also effect to html_static_path and html_extra_path +exclude_patterns = [] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = "sphinx" + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "sphinx_rtd_theme" + +# Theme options are theme-specific and customize the look and feel of a theme +# further. 
For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = [] #'_static'] + + +# -- Options for HTMLHelp output ------------------------------------------ + +# Output file base name for HTML help builder. +htmlhelp_basename = "readabilitydoc" + + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [(master_doc, "readability.tex", "Readability Documentation", "Yuri Baburov", "manual")] + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [(master_doc, "readability", "readability Documentation", [author], 1)] + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ( + master_doc, + "readability", + "Readability Documentation", + author, + "readability", + "One line description of project.", + "Miscellaneous", + ) +] + + +intersphinx_mapping = { + "python": ("https://docs.python.org/3", None), +} diff --git a/doc/source/index.rst b/doc/source/index.rst new file mode 100644 index 00000000..e3bce61d --- /dev/null +++ b/doc/source/index.rst @@ -0,0 +1,13 @@ +.. include:: ../../README.rst + +.. 
toctree:: + :maxdepth: 2 + + api + +Indices and tables +------------------ + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..44992853 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,25 @@ +[tool.poetry] +name = "readability-lxml" +version = "0.8.4.1" +description = "fast html to text parser (article readability tool) with python 3 support" +authors = ["Yuri Baburov "] +license = "Apache License 2.0" +readme = "README.md" +packages = [ + { include = "readability" }, +] + +[tool.poetry.dependencies] +python = ">=3.8.2,<3.14" +chardet = "^5.2.0" +cssselect = [ + { version = "~1.2", markers = "python_version < '3.9'" }, + { version = "~1.3", markers = "python_version >= '3.9'" } +] +lxml = {extras = ["html-clean"], version = "^5.4.0"} +lxml-html-clean = {markers = "python_version < \"3.11\"", version = "^0.4.2"} + + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/readability/__init__.py b/readability/__init__.py index 8822a512..b36f021d 100644 --- a/readability/__init__.py +++ b/readability/__init__.py @@ -1 +1,3 @@ +__version__ = "0.8.4.1" + from .readability import Document diff --git a/readability/browser.py b/readability/browser.py index bcfe61c2..42117a5a 100644 --- a/readability/browser.py +++ b/readability/browser.py @@ -7,14 +7,15 @@ def open_in_browser(html): import os import webbrowser import tempfile - handle, fn = tempfile.mkstemp(suffix='.html') - f = os.fdopen(handle, 'wb') + + handle, fn = tempfile.mkstemp(suffix=".html") + f = os.fdopen(handle, "wb") try: f.write(b"") - f.write(html.encode('utf-8')) + f.write(html.encode("utf-8")) finally: # we leak the file itself here, but we should at least close it f.close() - url = 'file://' + fn.replace(os.path.sep, '/') + url = "file://" + fn.replace(os.path.sep, "/") webbrowser.open(url) return url diff --git a/readability/cleaners.py b/readability/cleaners.py index 5cbab474..e0b07260 100644 --- a/readability/cleaners.py +++ b/readability/cleaners.py @@ -1,33 +1,55 @@ # strip out a set of nuisance html attributes that can mess up rendering in RSS feeds import re -from lxml.html.clean import Cleaner +try: + from lxml.html.clean import Cleaner +except ImportError: + from lxml_html_clean import Cleaner -bad_attrs = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*', 'on*'] +bad_attrs = ["width", "height", "style", "[-a-z]*color", "background[-a-z]*", "on*"] single_quoted = "'[^']+'" double_quoted = '"[^"]+"' -non_space = '[^ "\'>]+' -htmlstrip = re.compile("<" # open - "([^>]+) " # prefix - "(?:%s) *" % ('|'.join(bad_attrs),) + # undesirable attributes - '= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) + # value - "([^>]*)" # postfix - ">" # end -, re.I) +non_space = "[^ \"'>]+" +htmlstrip = re.compile( + "<" # open + "([^>]+) " # prefix + "(?:%s) *" % ("|".join(bad_attrs),) + + "= *(?:%s|%s|%s)" # undesirable attributes + % (non_space, single_quoted, double_quoted) + + "([^>]*)" # value # postfix + ">", # end + re.I, +) + def clean_attributes(html): while htmlstrip.search(html): - html = htmlstrip.sub('<\\1\\2>', html) + html = htmlstrip.sub("<\\1\\2>", html) return html + def normalize_spaces(s): if not s: - return '' + return "" """replace any sequence of whitespace characters with a single space""" - return ' '.join(s.split()) + return " ".join(s.split()) + -html_cleaner = Cleaner(scripts=True, javascript=True, comments=True, - style=True, links=True, meta=False, 
add_nofollow=False, - page_structure=False, processing_instructions=True, embedded=False, - frames=False, forms=False, annoying_tags=False, remove_tags=None, - remove_unknown_tags=False, safe_attrs_only=False) +html_cleaner = Cleaner( + scripts=True, + javascript=True, + comments=True, + style=True, + links=True, + meta=False, + add_nofollow=False, + page_structure=False, + processing_instructions=True, + embedded=False, + frames=False, + forms=False, + annoying_tags=False, + remove_tags=None, + remove_unknown_tags=False, + safe_attrs_only=False, +) diff --git a/readability/compat/__init__.py b/readability/compat/__init__.py deleted file mode 100644 index 4d89b0d2..00000000 --- a/readability/compat/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -""" -This module contains compatibility helpers for Python 2/3 interoperability. - -It mainly exists because their are certain incompatibilities in the Python -syntax that can only be solved by conditionally importing different functions. -""" -import sys -if sys.version_info[0] == 2: - str_ = unicode -elif sys.version_info[0] == 3: - str_ = str diff --git a/readability/compat/three.py b/readability/compat/three.py deleted file mode 100644 index 26351575..00000000 --- a/readability/compat/three.py +++ /dev/null @@ -1,6 +0,0 @@ -def raise_with_traceback(exc_type, traceback, *args, **kwargs): - """ - Raise a new exception of type `exc_type` with an existing `traceback`. All - additional (keyword-)arguments are forwarded to `exc_type` - """ - raise exc_type(*args, **kwargs).with_traceback(traceback) diff --git a/readability/compat/two.py b/readability/compat/two.py deleted file mode 100644 index 642ecb75..00000000 --- a/readability/compat/two.py +++ /dev/null @@ -1,6 +0,0 @@ -def raise_with_traceback(exc_type, traceback, *args, **kwargs): - """ - Raise a new exception of type `exc_type` with an existing `traceback`. All - additional (keyword-)arguments are forwarded to `exc_type` - """ - raise exc_type(*args, **kwargs), None, traceback diff --git a/readability/debug.py b/readability/debug.py index f14f6827..3bc81974 100644 --- a/readability/debug.py +++ b/readability/debug.py @@ -1,7 +1,7 @@ import re -#FIXME: use with caution, can leak memory +# FIXME: use with caution, can leak memory uids = {} uids_document = None @@ -9,17 +9,17 @@ def describe_node(node): global uids if node is None: - return '' - if not hasattr(node, 'tag'): + return "" + if not hasattr(node, "tag"): return "[%s]" % type(node) name = node.tag - if node.get('id', ''): - name += '#' + node.get('id') - if node.get('class', '').strip(): - name += '.' + '.'.join(node.get('class').split()) - if name[:4] in ['div#', 'div.']: + if node.get("id", ""): + name += "#" + node.get("id") + if node.get("class", "").strip(): + name += "." 
+ ".".join(node.get("class").split()) + if name[:4] in ["div#", "div."]: name = name[3:] - if name in ['tr', 'td', 'div', 'p']: + if name in ["tr", "td", "div", "p"]: uid = uids.get(node) if uid is None: uid = uids[node] = len(uids) + 1 @@ -34,20 +34,18 @@ def describe(node, depth=1): uids = {} uids_document = doc - #return repr(NodeRepr(node)) - parent = '' + # return repr(NodeRepr(node)) + parent = "" if depth and node.getparent() is not None: - parent = describe(node.getparent(), depth=depth - 1) + '>' + parent = describe(node.getparent(), depth=depth - 1) + ">" return parent + describe_node(node) -RE_COLLAPSE_WHITESPACES = re.compile('\s+', re.U) +RE_COLLAPSE_WHITESPACES = re.compile(r"\s+", re.U) def text_content(elem, length=40): - content = RE_COLLAPSE_WHITESPACES.sub(' ', elem.text_content().replace('\r', '')) + content = RE_COLLAPSE_WHITESPACES.sub(" ", elem.text_content().replace("\r", "")) if len(content) < length: return content - return content[:length] + '...' - - + return content[:length] + "..." diff --git a/readability/encoding.py b/readability/encoding.py index cc14320d..08332df0 100644 --- a/readability/encoding.py +++ b/readability/encoding.py @@ -1,22 +1,25 @@ import re -import chardet -import sys +try: + import cchardet as chardet +except ImportError: + import chardet -RE_CHARSET = re.compile(br']', flags=re.I) -RE_PRAGMA = re.compile(br']', flags=re.I) -RE_XML = re.compile(br'^<\?xml.*?encoding=["\']*(.+?)["\'>]') +RE_CHARSET = re.compile(r']', flags=re.I) +RE_PRAGMA = re.compile(r']', flags=re.I) +RE_XML = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]') CHARSETS = { - 'big5': 'big5hkscs', - 'gb2312': 'gb18030', - 'ascii': 'utf-8', - 'maccyrillic': 'cp1251', - 'win1251': 'cp1251', - 'win-1251': 'cp1251', - 'windows-1251': 'cp1251', + "big5": "big5hkscs", + "gb2312": "gb18030", + "ascii": "utf-8", + "maccyrillic": "cp1251", + "win1251": "cp1251", + "win-1251": "cp1251", + "windows-1251": "cp1251", } + def fix_charset(encoding): """Overrides encoding when charset declaration or charset determination is a subset of a larger @@ -27,21 +30,20 @@ def fix_charset(encoding): def get_encoding(page): # Regex for XML and HTML Meta charset declaration - declared_encodings = (RE_CHARSET.findall(page) + - RE_PRAGMA.findall(page) + - RE_XML.findall(page)) + declared_encodings = ( + RE_CHARSET.findall(page) + RE_PRAGMA.findall(page) + RE_XML.findall(page) + ) # Try any declared encodings for declared_encoding in declared_encodings: try: - if sys.version_info[0] == 3: - # declared_encoding will actually be bytes but .decode() only - # accepts `str` type. Decode blindly with ascii because no one should - # ever use non-ascii characters in the name of an encoding. - declared_encoding = declared_encoding.decode('ascii', 'replace') + # Python3 only + # declared_encoding will actually be bytes but .decode() only + # accepts `str` type. Decode blindly with ascii because no one should + # ever use non-ascii characters in the name of an encoding. + declared_encoding = declared_encoding.decode("ascii", "replace") encoding = fix_charset(declared_encoding) - # Now let's decode the page page.decode(encoding) # It worked! 
@@ -51,12 +53,12 @@ def get_encoding(page): # Fallback to chardet if declared encodings fail # Remove all HTML tags, and leave only text for chardet - text = re.sub(b'(\s*]*>)+\s*', b' ', page).strip() + text = re.sub(r'(\s*]*>)+\s*', ' ', page).strip() enc = 'utf-8' if len(text) < 10: - return enc # can't guess + return enc # can't guess res = chardet.detect(text) - enc = res['encoding'] or 'utf-8' - #print '->', enc, "%.2f" % res['confidence'] + enc = res["encoding"] or "utf-8" + # print '->', enc, "%.2f" % res['confidence'] enc = fix_charset(enc) return enc diff --git a/readability/htmls.py b/readability/htmls.py index 843f0c0b..d99a9f53 100644 --- a/readability/htmls.py +++ b/readability/htmls.py @@ -1,39 +1,42 @@ from lxml.html import tostring -import logging import lxml.html -import re, sys +import re from .cleaners import normalize_spaces, clean_attributes from .encoding import get_encoding -from .compat import str_ -utf8_parser = lxml.html.HTMLParser(encoding='utf-8') +utf8_parser = lxml.html.HTMLParser(encoding="utf-8") + def build_doc(page): - if isinstance(page, str_): + if isinstance(page, str): encoding = None decoded_page = page else: - encoding = get_encoding(page) or 'utf-8' - decoded_page = page.decode(encoding, 'replace') - + encoding = get_encoding(page) or "utf-8" + decoded_page = page.decode(encoding, "replace") + # XXX: we have to do .decode and .encode even for utf-8 pages to remove bad characters - doc = lxml.html.document_fromstring(decoded_page.encode('utf-8', 'replace'), parser=utf8_parser) + doc = lxml.html.document_fromstring( + decoded_page.encode("utf-8", "replace"), parser=utf8_parser + ) return doc, encoding + def js_re(src, pattern, flags, repl): - return re.compile(pattern, flags).sub(src, repl.replace('$', '\\')) + return re.compile(pattern, flags).sub(src, repl.replace("$", "\\")) + def normalize_entities(cur_title): entities = { - u'\u2014':'-', - u'\u2013':'-', - u'—': '-', - u'–': '-', - u'\u00A0': ' ', - u'\u00AB': '"', - u'\u00BB': '"', - u'"': '"', + "\u2014": "-", + "\u2013": "-", + "—": "-", + "–": "-", + "\u00A0": " ", + "\u00AB": '"', + "\u00BB": '"', + """: '"', } for c, r in entities.items(): if c in cur_title: @@ -41,36 +44,59 @@ def normalize_entities(cur_title): return cur_title + def norm_title(title): return normalize_entities(normalize_spaces(title)) + def get_title(doc): - title = doc.find('.//title') + title = doc.find(".//title") if title is None or title.text is None or len(title.text) == 0: - return '[no-title]' + return "[no-title]" return norm_title(title.text) + +def get_author(doc): + author = doc.find(".//meta[@name='author']") + if author is None or 'content' not in author.keys() or \ + len(author.get('content')) == 0: + return "[no-author]" + + return author.get('content') + + def add_match(collection, text, orig): text = norm_title(text) if len(text.split()) >= 2 and len(text) >= 15: - if text.replace('"', '') in orig.replace('"', ''): + if text.replace('"', "") in orig.replace('"', ""): collection.add(text) -TITLE_CSS_HEURISTICS = ['#title', '#head', '#heading', '.pageTitle', - '.news_title', '.title', '.head', '.heading', - '.contentheading', '.small_header_red'] + +TITLE_CSS_HEURISTICS = [ + "#title", + "#head", + "#heading", + ".pageTitle", + ".news_title", + ".title", + ".head", + ".heading", + ".contentheading", + ".small_header_red", +] + def shorten_title(doc): - title = doc.find('.//title') + title = doc.find(".//title") if title is None or title.text is None or len(title.text) == 0: - return '' + return "" title 
= orig = norm_title(title.text) candidates = set() - for item in ['.//h1', './/h2', './/h3']: + for item in [".//h1", ".//h2", ".//h3"]: for e in list(doc.iterfind(item)): if e.text: add_match(candidates, e.text, orig) @@ -84,41 +110,52 @@ def shorten_title(doc): if e.text_content(): add_match(candidates, e.text_content(), orig) + cjk = re.compile('[\u4e00-\u9fff]+') + if candidates: title = sorted(candidates, key=len)[-1] else: - for delimiter in [' | ', ' - ', ' :: ', ' / ']: + for delimiter in [" | ", " - ", " :: ", " / "]: if delimiter in title: parts = orig.split(delimiter) - if len(parts[0].split()) >= 4: - title = parts[0] + p0 = parts[0] + pl = parts[-1] + if (len(p0.split()) >= 4) or (len(p0) >= 4 and cjk.search(p0)): + title = p0 break - elif len(parts[-1].split()) >= 4: - title = parts[-1] + elif (len(pl.split()) >= 4) or (len(pl) >= 4 and cjk.search(pl)): + title = pl break else: - if ': ' in title: - parts = orig.split(': ') - if len(parts[-1].split()) >= 4: - title = parts[-1] + if ": " in title: + p1 = orig.split(": ")[-1] + if (len(p1.split()) >= 4) or (len(p1) >= 4 and cjk.search(p1)): + title = p1 else: - title = orig.split(': ', 1)[1] + title = orig.split(": ", 1)[1] - if not 15 < len(title) < 150: + if cjk.search(title): + if not (4 <= len(title) < 100): # Allow length >= 4, cap at 100 + return orig + elif not 15 < len(title) < 150: return orig return title + +# is it necessary? Cleaner from LXML is initialized correctly in cleaners.py def get_body(doc): - for elem in doc.xpath('.//script | .//link | .//style'): + for elem in doc.xpath(".//script | .//link | .//style"): elem.drop_tree() # tostring() always return utf-8 encoded string # FIXME: isn't better to use tounicode? - raw_html = str_(tostring(doc.body or doc)) + raw_html = tostring(doc.body or doc) + if isinstance(raw_html, bytes): + raw_html = raw_html.decode() cleaned = clean_attributes(raw_html) try: - #BeautifulSoup(cleaned) #FIXME do we really need to try loading it? + # BeautifulSoup(cleaned) #FIXME do we really need to try loading it? 
return cleaned - except Exception: #FIXME find the equivalent lxml error - #logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned)) + except Exception: # FIXME find the equivalent lxml error + # logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned)) return raw_html diff --git a/readability/readability.py b/readability/readability.py index 8331e279..c5739056 100755 --- a/readability/readability.py +++ b/readability/readability.py @@ -1,40 +1,54 @@ #!/usr/bin/env python -from __future__ import print_function import logging import re import sys +import urllib.request +import urllib.parse +import urllib.error -from collections import defaultdict from lxml.etree import tostring from lxml.etree import tounicode +from lxml.etree import _ElementTree from lxml.html import document_fromstring from lxml.html import fragment_fromstring +from lxml.html import HtmlElement from .cleaners import clean_attributes from .cleaners import html_cleaner from .htmls import build_doc from .htmls import get_body from .htmls import get_title +from .htmls import get_author from .htmls import shorten_title -from .compat import str_ from .debug import describe, text_content log = logging.getLogger("readability.readability") REGEXES = { - 'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I), - 'okMaybeItsACandidateRe': re.compile('and|article|body|column|main|shadow', re.I), - 'positiveRe': re.compile('article|body|content|entry|hentry|main|page|pagination|post|text|blog|story', re.I), - 'negativeRe': re.compile('combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget', re.I), - 'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I), - #'replaceBrsRe': re.compile('(]*>[ \n\r\t]*){2,}',re.I), - #'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I), - #'trimRe': re.compile('^\s+|\s+$/'), - #'normalizeRe': re.compile('\s{2,}/'), - #'killBreaksRe': re.compile('((\s| ?)*){1,}/'), - 'videoRe': re.compile('https?:\/\/(www\.)?(youtube|vimeo)\.com', re.I), - #skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i, + "unlikelyCandidatesRe": re.compile( + r"combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter", + re.I, + ), + "okMaybeItsACandidateRe": re.compile(r"and|article|body|column|main|shadow", re.I), + "positiveRe": re.compile( + r"article|body|content|entry|hentry|main|page|pagination|post|text|blog|story", + re.I, + ), + "negativeRe": re.compile( + r"combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget", + re.I, + ), + "divToPElementsRe": re.compile( + r"<(a|blockquote|dl|div|img|ol|p|pre|table|ul)", re.I + ), + # 'replaceBrsRe': re.compile(r'(]*>[ \n\r\t]*){2,}',re.I), + # 'replaceFontsRe': re.compile(r'<(\/?)font[^>]*>',re.I), + # 'trimRe': re.compile(r'^\s+|\s+$/'), + # 'normalizeRe': re.compile(r'\s{2,}/'), + # 'killBreaksRe': re.compile(r'((\s| ?)*){1,}/'), + "videoRe": re.compile(r"https?:\/\/(www\.)?(youtube|vimeo)\.com", re.I), + # skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i, } @@ -46,54 +60,72 @@ def to_int(x): if not x: return None x = x.strip() - if 
x.endswith('px'): + if x.endswith("px"): return int(x[:-2]) - if x.endswith('em'): + if x.endswith("em"): return int(x[:-2]) * 12 return int(x) def clean(text): - text = re.sub('\s*\n\s*', '\n', text) - text = re.sub('\t|[ \t]{2,}', ' ', text) + # Many spaces make the following regexes run forever + text = re.sub(r"\s{255,}", " " * 255, text) + text = re.sub(r"\s*\n\s*", "\n", text) + text = re.sub(r"\t|[ \t]{2,}", " ", text) return text.strip() def text_length(i): return len(clean(i.text_content() or "")) -regexp_type = type(re.compile('hello, world')) def compile_pattern(elements): if not elements: return None - elif isinstance(elements, (list, tuple)): - return list(elements) - elif isinstance(elements, regexp_type): + elif isinstance(elements, re.Pattern): return elements + elif isinstance(elements, (str, bytes)): + if isinstance(elements, bytes): + elements = str(elements, "utf-8") + elements = elements.split(",") + if isinstance(elements, (list, tuple)): + return re.compile("|".join([re.escape(x.strip()) for x in elements]), re.U) else: + raise Exception(f"Unknown type for the pattern: {type(elements)}") # assume string or string like object - elements = elements.split(',') - return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U) + class Document: """Class to build a etree document out of html.""" - def __init__(self, input, positive_keywords=None, negative_keywords=None, - url=None, min_text_length=25, retry_length=250, xpath=False): + def __init__( + self, + input, + positive_keywords=None, + negative_keywords=None, + url=None, + min_text_length=25, + retry_length=250, + xpath=False, + handle_failures="discard", + ): """Generate the document :param input: string of the html content. - :param positive_keywords: regex or list of patterns in classes and ids - :param negative_keywords: regex or list of patterns in classes and ids + :param positive_keywords: regex, list or comma-separated string of patterns in classes and ids + :param negative_keywords: regex, list or comma-separated string in classes and ids :param min_text_length: Tunable. Set to a higher value for more precise detection of longer texts. :param retry_length: Tunable. Set to a lower value for better detection of very small texts. :param xpath: If set to True, adds x="..." attribute to each HTML node, containing xpath path pointing to original document path (allows to reconstruct selected summary in original document). - - Example: + :param handle_failures: Parameter passed to `lxml` for handling failure during exception. + Support options = ["discard", "ignore", None] + + Examples: positive_keywords=["news-item", "block"] + positive_keywords=["news-item, block"] + positive_keywords=re.compile("news|block") negative_keywords=["mysidebar", "related", "ads"] The Document class is not re-enterable. 
@@ -114,6 +146,7 @@ def __init__(self, input, positive_keywords=None, negative_keywords=None, self.min_text_length = min_text_length self.retry_length = retry_length self.xpath = xpath + self.handle_failures = handle_failures def _html(self, force=False): if force or self.html is None: @@ -121,25 +154,37 @@ def _html(self, force=False): if self.xpath: root = self.html.getroottree() for i in self.html.getiterator(): - #print root.getpath(i) - i.attrib['x'] = root.getpath(i) + # print root.getpath(i) + i.attrib["x"] = root.getpath(i) return self.html def _parse(self, input): - doc, self.encoding = build_doc(input) + if isinstance(input, (_ElementTree, HtmlElement)): + doc = input + self.encoding = 'utf-8' + else: + doc, self.encoding = build_doc(input) doc = html_cleaner.clean_html(doc) base_href = self.url if base_href: # trying to guard against bad links like try: # such support is added in lxml 3.3.0 - doc.make_links_absolute(base_href, resolve_base_href=True, handle_failures='discard') - except TypeError: #make_links_absolute() got an unexpected keyword argument 'handle_failures' + doc.make_links_absolute( + base_href, + resolve_base_href=True, + handle_failures=self.handle_failures, + ) + except TypeError: # make_links_absolute() got an unexpected keyword argument 'handle_failures' # then we have lxml < 3.3.0 # please upgrade to lxml >= 3.3.0 if you're failing here! - doc.make_links_absolute(base_href, resolve_base_href=True) + doc.make_links_absolute( + base_href, + resolve_base_href=True, + handle_failures=self.handle_failures, + ) else: - doc.resolve_base_href() + doc.resolve_base_href(handle_failures=self.handle_failures) return doc def content(self): @@ -150,6 +195,10 @@ def title(self): """Returns document title""" return get_title(self._html(True)) + def author(self): + """Returns document author""" + return get_author(self._html(True)) + def short_title(self): """Returns cleaned up document title""" return shorten_title(self._html(True)) @@ -159,14 +208,15 @@ def get_clean_html(self): An internal method, which can be overridden in subclasses, for example, to disable or to improve DOM-to-text conversion in .summary() method """ - return clean_attributes(tounicode(self.html)) + return clean_attributes(tounicode(self.html, method="html")) - def summary(self, html_partial=False): + def summary(self, html_partial=False, keep_all_images=False): """ Given a HTML file, extracts the text of the article. :param html_partial: return only the div of the document, don't wrap - in html and body tags. + in html and body tags. + :param keep_all_images: Keep all images in summary. Warning: It mutates internal DOM representation of the HTML document, so it is better to call other API methods before this one. 
@@ -175,10 +225,10 @@ def summary(self, html_partial=False): ruthless = True while True: self._html(True) - for i in self.tags(self.html, 'script', 'style'): + for i in self.tags(self.html, "script", "style"): i.drop_tree() - for i in self.tags(self.html, 'body'): - i.set('id', 'readabilityBody') + for i in self.tags(self.html, "body"): + i.set("id", "readabilityBody") if ruthless: self.remove_unlikely_candidates() self.transform_misused_divs_into_paragraphs() @@ -187,27 +237,30 @@ def summary(self, html_partial=False): best_candidate = self.select_best_candidate(candidates) if best_candidate: - article = self.get_article(candidates, best_candidate, - html_partial=html_partial) + article = self.get_article( + candidates, best_candidate, html_partial=html_partial + ) else: if ruthless: log.info("ruthless removal did not work. ") ruthless = False log.debug( - ("ended up stripping too much - " - "going for a safer _parse")) + "ended up stripping too much - " + "going for a safer _parse" + ) # try again continue else: log.debug( - ("Ruthless and lenient parsing did not work. " - "Returning raw html")) - article = self.html.find('body') + "Ruthless and lenient parsing did not work. " + "Returning raw html" + ) + article = self.html.find("body") if article is None: article = self.html - cleaned_article = self.sanitize(article, candidates) + cleaned_article = self.sanitize(article, candidates, keep_all_images) - article_length = len(cleaned_article or '') + article_length = len(cleaned_article or "") retry_length = self.retry_length of_acceptable_length = article_length >= retry_length if ruthless and not of_acceptable_length: @@ -217,26 +270,20 @@ def summary(self, html_partial=False): else: return cleaned_article except Exception as e: - log.exception('error getting summary: ') - if sys.version_info[0] == 2: - from .compat.two import raise_with_traceback - else: - from .compat.three import raise_with_traceback - raise_with_traceback(Unparseable, sys.exc_info()[2], str_(e)) + log.exception("error getting summary: ") + raise Unparseable(str(e)).with_traceback(sys.exc_info()[2]) def get_article(self, candidates, best_candidate, html_partial=False): # Now that we have the top candidate, look through its siblings for # content that might also be related. # Things like preambles, content split by ads that we removed, etc. - sibling_score_threshold = max([ - 10, - best_candidate['content_score'] * 0.2]) + sibling_score_threshold = max([10, best_candidate["content_score"] * 0.2]) # create a new html document with a html->body->div if html_partial: - output = fragment_fromstring('
<div/>') + output = fragment_fromstring("<div/>
") else: - output = document_fromstring('
<div/>') - best_elem = best_candidate['elem'] + output = document_fromstring("<div/>
") + best_elem = best_candidate["elem"] parent = best_elem.getparent() siblings = parent.getchildren() if parent is not None else [best_elem] for sibling in siblings: @@ -246,8 +293,10 @@ def get_article(self, candidates, best_candidate, html_partial=False): if sibling is best_elem: append = True sibling_key = sibling # HashableElement(sibling) - if sibling_key in candidates and \ - candidates[sibling_key]['content_score'] >= sibling_score_threshold: + if ( + sibling_key in candidates + and candidates[sibling_key]["content_score"] >= sibling_score_threshold + ): append = True if sibling.tag == "p": @@ -257,9 +306,11 @@ def get_article(self, candidates, best_candidate, html_partial=False): if node_length > 80 and link_density < 0.25: append = True - elif node_length <= 80 \ - and link_density == 0 \ - and re.search('\.( |$)', node_content): + elif ( + node_length <= 80 + and link_density == 0 + and re.search(r"\.( |$)", node_content) + ): append = True if append: @@ -269,7 +320,7 @@ def get_article(self, candidates, best_candidate, html_partial=False): output.append(sibling) else: output.getchildren()[0].getchildren()[0].append(sibling) - #if output is not None: + # if output is not None: # output.append(best_elem) return output @@ -278,15 +329,11 @@ def select_best_candidate(self, candidates): return None sorted_candidates = sorted( - candidates.values(), - key=lambda x: x['content_score'], - reverse=True + candidates.values(), key=lambda x: x["content_score"], reverse=True ) for candidate in sorted_candidates[:5]: - elem = candidate['elem'] - log.info("Top 5 : %6.3f %s" % ( - candidate['content_score'], - describe(elem))) + elem = candidate["elem"] + log.debug("Top 5 : {:6.3f} {}".format(candidate["content_score"], describe(elem))) best_candidate = sorted_candidates[0] return best_candidate @@ -295,7 +342,7 @@ def get_link_density(self, elem): link_length = 0 for i in elem.findall(".//a"): link_length += text_length(i) - #if len(elem.findall(".//div") or elem.findall(".//p")): + # if len(elem.findall(".//div") or elem.findall(".//p")): # link_length = link_length total_length = text_length(elem) return float(link_length) / max(total_length, 1) @@ -323,20 +370,19 @@ def score_paragraphs(self): ordered.append(parent_node) if grand_parent_node is not None and grand_parent_node not in candidates: - candidates[grand_parent_node] = self.score_node( - grand_parent_node) + candidates[grand_parent_node] = self.score_node(grand_parent_node) ordered.append(grand_parent_node) content_score = 1 - content_score += len(inner_text.split(',')) + content_score += len(inner_text.split(",")) content_score += min((inner_text_len / 100), 3) - #if elem not in candidates: + # if elem not in candidates: # candidates[elem] = self.score_node(elem) - #WTF? candidates[elem]['content_score'] += content_score - candidates[parent_node]['content_score'] += content_score + # WTF? candidates[elem]['content_score'] += content_score + candidates[parent_node]["content_score"] += content_score if grand_parent_node is not None: - candidates[grand_parent_node]['content_score'] += content_score / 2.0 + candidates[grand_parent_node]["content_score"] += content_score / 2.0 # Scale the final candidates score based on link density. 
Good content # should have a relatively small link density (5% or less) and be @@ -344,24 +390,23 @@ def score_paragraphs(self): for elem in ordered: candidate = candidates[elem] ld = self.get_link_density(elem) - score = candidate['content_score'] - log.debug("Branch %6.3f %s link density %.3f -> %6.3f" % ( - score, - describe(elem), - ld, - score * (1 - ld))) - candidate['content_score'] *= (1 - ld) + score = candidate["content_score"] + log.debug( + "Branch %6.3f %s link density %.3f -> %6.3f" + % (score, describe(elem), ld, score * (1 - ld)) + ) + candidate["content_score"] *= 1 - ld return candidates def class_weight(self, e): weight = 0 - for feature in [e.get('class', None), e.get('id', None)]: + for feature in [e.get("class", None), e.get("id", None)]: if feature: - if REGEXES['negativeRe'].search(feature): + if REGEXES["negativeRe"].search(feature): weight -= 25 - if REGEXES['positiveRe'].search(feature): + if REGEXES["positiveRe"].search(feature): weight += 25 if self.positive_keywords and self.positive_keywords.search(feature): @@ -370,10 +415,10 @@ def class_weight(self, e): if self.negative_keywords and self.negative_keywords.search(feature): weight -= 25 - if self.positive_keywords and self.positive_keywords.match('tag-'+e.tag): + if self.positive_keywords and self.positive_keywords.match("tag-" + e.tag): weight += 25 - if self.negative_keywords and self.negative_keywords.match('tag-'+e.tag): + if self.negative_keywords and self.negative_keywords.match("tag-" + e.tag): weight -= 25 return weight @@ -381,72 +426,84 @@ def class_weight(self, e): def score_node(self, elem): content_score = self.class_weight(elem) name = elem.tag.lower() - if name == "div": + if name in ["div", "article"]: content_score += 5 elif name in ["pre", "td", "blockquote"]: content_score += 3 - elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form"]: + elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form", "aside"]: content_score -= 3 - elif name in ["h1", "h2", "h3", "h4", "h5", "h6", "th"]: + elif name in [ + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "th", + "header", + "footer", + "nav", + ]: content_score -= 5 - return { - 'content_score': content_score, - 'elem': elem - } + return {"content_score": content_score, "elem": elem} def remove_unlikely_candidates(self): - for elem in self.html.iter(): - s = "%s %s" % (elem.get('class', ''), elem.get('id', '')) + for elem in self.html.findall(".//*"): + s = "{} {}".format(elem.get("class", ""), elem.get("id", "")) if len(s) < 2: continue - if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.tag not in ['html', 'body']: + if ( + REGEXES["unlikelyCandidatesRe"].search(s) + and (not REGEXES["okMaybeItsACandidateRe"].search(s)) + and elem.tag not in ["html", "body"] + ): log.debug("Removing unlikely candidate - %s" % describe(elem)) elem.drop_tree() def transform_misused_divs_into_paragraphs(self): - for elem in self.tags(self.html, 'div'): + for elem in self.tags(self.html, "div"): # transform
<div>s that do not contain other block elements into # <p>
s - #FIXME: The current implementation ignores all descendants that + # FIXME: The current implementation ignores all descendants that # are not direct children of elem # This results in incorrect results in case there is an # buried within an for example - if not REGEXES['divToPElementsRe'].search( - str_(b''.join(map(tostring, list(elem))))): - #log.debug("Altering %s to p" % (describe(elem))) + if not REGEXES["divToPElementsRe"].search( + str(b"".join(tostring(s, encoding='utf-8') for s in elem)) + # str(b"".join(map(tostring_, list(elem)))) + ): + # log.debug("Altering %s to p" % (describe(elem))) elem.tag = "p" - #print "Fixed element "+describe(elem) + # print "Fixed element "+describe(elem) - for elem in self.tags(self.html, 'div'): + for elem in self.tags(self.html, "div"): if elem.text and elem.text.strip(): - p = fragment_fromstring('
<p/>') + p = fragment_fromstring("<p/>
") p.text = elem.text elem.text = None elem.insert(0, p) - #print "Appended "+tounicode(p)+" to "+describe(elem) + # print "Appended "+tounicode(p)+" to "+describe(elem) for pos, child in reversed(list(enumerate(elem))): if child.tail and child.tail.strip(): - p = fragment_fromstring('
<p/>') + p = fragment_fromstring("<p/>
") p.text = child.tail child.tail = None elem.insert(pos + 1, p) - #print "Inserted "+tounicode(p)+" to "+describe(elem) - if child.tag == 'br': - #print 'Dropped
<br> at '+describe(elem) + # print "Inserted "+tounicode(p)+" to "+describe(elem) + if child.tag == "br": + # print 'Dropped <br>
at '+describe(elem) child.drop_tree() def tags(self, node, *tag_names): for tag_name in tag_names: - for e in node.findall('.//%s' % tag_name): - yield e + yield from node.findall(".//%s" % tag_name) def reverse_tags(self, node, *tag_names): for tag_name in tag_names: - for e in reversed(node.findall('.//%s' % tag_name)): - yield e + yield from reversed(node.findall(".//%s" % tag_name)) - def sanitize(self, node, candidates): + def sanitize(self, node, candidates, keep_all_images=False): MIN_LEN = self.min_text_length for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"): if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33: @@ -457,31 +514,35 @@ def sanitize(self, node, candidates): for elem in self.tags(node, "iframe"): if "src" in elem.attrib and REGEXES["videoRe"].search(elem.attrib["src"]): - elem.text = "VIDEO" # ADD content to iframe text node to force proper output + elem.text = "VIDEO" # ADD content to iframe text node to force proper output else: elem.drop_tree() allowed = {} # Conditionally clean s,
<ul>s, and <div>
    s - for el in self.reverse_tags(node, "table", "ul", "div"): + for el in self.reverse_tags( + node, "table", "ul", "div", "aside", "header", "footer", "section" + ): if el in allowed: continue weight = self.class_weight(el) if el in candidates: - content_score = candidates[el]['content_score'] - #print '!',el, '-> %6.3f' % content_score + content_score = candidates[el]["content_score"] + # print '!',el, '-> %6.3f' % content_score else: content_score = 0 tag = el.tag if weight + content_score < 0: - log.debug("Removed %s with score %6.3f and weight %-3s" % - (describe(el), content_score, weight, )) + log.debug( + "Removed %s with score %6.3f and weight %-3s" + % (describe(el), content_score, weight,) + ) el.drop_tree() elif el.text_content().count(",") < 10: counts = {} - for kind in ['p', 'img', 'li', 'a', 'embed', 'input']: - counts[kind] = len(el.findall('.//%s' % kind)) + for kind in ["p", "img", "li", "a", "embed", "input"]: + counts[kind] = len(el.findall(".//%s" % kind)) counts["li"] -= 100 counts["input"] -= len(el.findall('.//input[@type="hidden"]')) @@ -491,130 +552,166 @@ def sanitize(self, node, candidates): parent_node = el.getparent() if parent_node is not None: if parent_node in candidates: - content_score = candidates[parent_node]['content_score'] + content_score = candidates[parent_node]["content_score"] else: content_score = 0 - #if parent_node is not None: - #pweight = self.class_weight(parent_node) + content_score - #pname = describe(parent_node) - #else: - #pweight = 0 - #pname = "no parent" + # if parent_node is not None: + # pweight = self.class_weight(parent_node) + content_score + # pname = describe(parent_node) + # else: + # pweight = 0 + # pname = "no parent" to_remove = False reason = "" - #if el.tag == 'div' and counts["img"] >= 1: - # continue - if counts["p"] and counts["img"] > 1+counts["p"]*1.3: + if keep_all_images and el.tag == 'div' and counts["img"] >= 1: + continue + if counts["p"] and counts["img"] > 1 + counts["p"] * 1.3: reason = "too many images (%s)" % counts["img"] to_remove = True - elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol": + elif counts["li"] > counts["p"] and tag not in ("ol", "ul"): reason = "more
<li>s than <p>s" to_remove = True elif counts["input"] > (counts["p"] / 3): reason = "less than 3x <p>
    s than s" to_remove = True elif content_length < MIN_LEN and counts["img"] == 0: - reason = "too short content length %s without a single image" % content_length + reason = ( + "too short content length %s without a single image" + % content_length + ) to_remove = True elif content_length < MIN_LEN and counts["img"] > 2: - reason = "too short content length %s and too many images" % content_length + reason = ( + "too short content length %s and too many images" + % content_length + ) to_remove = True elif weight < 25 and link_density > 0.2: - reason = "too many links %.3f for its weight %s" % ( - link_density, weight) - to_remove = True + reason = "too many links {:.3f} for its weight {}".format( + link_density, + weight, + ) + to_remove = True elif weight >= 25 and link_density > 0.5: - reason = "too many links %.3f for its weight %s" % ( - link_density, weight) + reason = "too many links {:.3f} for its weight {}".format( + link_density, + weight, + ) to_remove = True - elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1: - reason = "s with too short content length, or too many s" + elif (counts["embed"] == 1 and content_length < 75) or counts[ + "embed" + ] > 1: + reason = ( + "s with too short content length, or too many s" + ) to_remove = True elif not content_length: reason = "no content" to_remove = True -# if el.tag == 'div' and counts['img'] >= 1 and to_remove: -# imgs = el.findall('.//img') -# valid_img = False -# log.debug(tounicode(el)) -# for img in imgs: -# -# height = img.get('height') -# text_length = img.get('text_length') -# log.debug ("height %s text_length %s" %(repr(height), repr(text_length))) -# if to_int(height) >= 100 or to_int(text_length) >= 100: -# valid_img = True -# log.debug("valid image" + tounicode(img)) -# break -# if valid_img: -# to_remove = False -# log.debug("Allowing %s" %el.text_content()) -# for desnode in self.tags(el, "table", "ul", "div"): -# allowed[desnode] = True - - #find x non empty preceding and succeeding siblings + # if el.tag == 'div' and counts['img'] >= 1 and to_remove: + # imgs = el.findall('.//img') + # valid_img = False + # log.debug(tounicode(el)) + # for img in imgs: + # + # height = img.get('height') + # text_length = img.get('text_length') + # log.debug ("height %s text_length %s" %(repr(height), repr(text_length))) + # if to_int(height) >= 100 or to_int(text_length) >= 100: + # valid_img = True + # log.debug("valid image" + tounicode(img)) + # break + # if valid_img: + # to_remove = False + # log.debug("Allowing %s" %el.text_content()) + # for desnode in self.tags(el, "table", "ul", "div"): + # allowed[desnode] = True + + # find x non empty preceding and succeeding siblings i, j = 0, 0 x = 1 siblings = [] for sib in el.itersiblings(): - #log.debug(sib.text_content()) + # log.debug(sib.text_content()) sib_content_length = text_length(sib) if sib_content_length: - i =+ 1 + i = +1 siblings.append(sib_content_length) if i == x: break for sib in el.itersiblings(preceding=True): - #log.debug(sib.text_content()) + # log.debug(sib.text_content()) sib_content_length = text_length(sib) if sib_content_length: - j =+ 1 + j = +1 siblings.append(sib_content_length) if j == x: break - #log.debug(str_(siblings)) + # log.debug(str_(siblings)) if siblings and sum(siblings) > 1000: to_remove = False log.debug("Allowing %s" % describe(el)) - for desnode in self.tags(el, "table", "ul", "div"): + for desnode in self.tags(el, "table", "ul", "div", "section"): allowed[desnode] = True if to_remove: - log.debug("Removed %6.3f 
%s with weight %s cause it has %s." % - (content_score, describe(el), weight, reason)) - #print tounicode(el) - #log.debug("pname %s pweight %.3f" %(pname, pweight)) + log.debug( + "Removed %6.3f %s with weight %s cause it has %s." + % (content_score, describe(el), weight, reason) + ) + # print tounicode(el) + # log.debug("pname %s pweight %.3f" %(pname, pweight)) el.drop_tree() else: - log.debug("Not removing %s of length %s: %s" % ( - describe(el), content_length, text_content(el))) + log.debug( + "Not removing %s of length %s: %s" + % (describe(el), content_length, text_content(el)) + ) self.html = node return self.get_clean_html() def main(): - VERBOSITY = { - 1: logging.WARNING, - 2: logging.INFO, - 3: logging.DEBUG - } + VERBOSITY = {1: logging.WARNING, 2: logging.INFO, 3: logging.DEBUG} from optparse import OptionParser + parser = OptionParser(usage="%prog: [options] [file]") - parser.add_option('-v', '--verbose', action='count', default=0) - parser.add_option('-b', '--browser', default=None, action='store_true', help="open in browser") - parser.add_option('-l', '--log', default=None, help="save logs into file (appended)") - parser.add_option('-u', '--url', default=None, help="use URL instead of a local file") - parser.add_option('-x', '--xpath', default=None, help="add original xpath") - parser.add_option('-p', '--positive-keywords', default=None, help="positive keywords (separated with comma)", action='store') - parser.add_option('-n', '--negative-keywords', default=None, help="negative keywords (separated with comma)", action='store') + parser.add_option("-v", "--verbose", action="count", default=0) + parser.add_option( + "-b", "--browser", default=None, action="store_true", help="open in browser" + ) + parser.add_option( + "-l", "--log", default=None, help="save logs into file (appended)" + ) + parser.add_option( + "-u", "--url", default=None, help="use URL instead of a local file" + ) + parser.add_option("-x", "--xpath", default=None, help="add original xpath") + parser.add_option( + "-p", + "--positive-keywords", + default=None, + help="positive keywords (comma-separated)", + action="store", + ) + parser.add_option( + "-n", + "--negative-keywords", + default=None, + help="negative keywords (comma-separated)", + action="store", + ) (options, args) = parser.parse_args() if options.verbose: - logging.basicConfig(level=VERBOSITY[options.verbose], filename=options.log, - format='%(asctime)s: %(levelname)s: %(message)s (at %(filename)s: %(lineno)d)') + logging.basicConfig( + level=VERBOSITY[options.verbose], + filename=options.log, + format="%(asctime)s: %(levelname)s: %(message)s (at %(filename)s: %(lineno)d)", + ) if not (len(args) == 1 or options.url): parser.print_help() @@ -622,36 +719,29 @@ def main(): file = None if options.url: - headers = {'User-Agent': 'Mozilla/5.0'} - if sys.version_info[0] == 3: - import urllib.request, urllib.parse, urllib.error - request = urllib.request.Request(options.url, None, headers) - file = urllib.request.urlopen(request) - else: - import urllib2 - request = urllib2.Request(options.url, None, headers) - file = urllib2.urlopen(request) + headers = {"User-Agent": "Mozilla/5.0"} + request = urllib.request.Request(options.url, None, headers) + file = urllib.request.urlopen(request) else: - file = open(args[0], 'rt') + file = open(args[0]) try: - doc = Document(file.read(), + doc = Document( + file.read(), url=options.url, - positive_keywords = options.positive_keywords, - negative_keywords = options.negative_keywords, + 
positive_keywords=options.positive_keywords, + negative_keywords=options.negative_keywords, ) if options.browser: from .browser import open_in_browser - result = '
<h2>' + doc.short_title() + '</h2><br/>' + doc.summary() + + result = "<h2>" + doc.short_title() + "</h2><br/>
    " + doc.summary() open_in_browser(result) else: - enc = sys.__stdout__.encoding or 'utf-8' # XXX: this hack could not always work, better to set PYTHONIOENCODING - result = 'Title:' + doc.short_title() + '\n' + doc.summary() - if sys.version_info[0] == 3: - print(result) - else: - print(result.encode(enc, 'replace')) + result = "Title:" + doc.short_title() + "\n" + doc.summary() + print(result) finally: file.close() -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 00000000..996bbfc0 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,3 @@ +nose +twine +flake8 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index d6e1198b..00000000 --- a/requirements.txt +++ /dev/null @@ -1 +0,0 @@ --e . diff --git a/setup.py b/setup.py index 18a4faeb..a88e8185 100755 --- a/setup.py +++ b/setup.py @@ -1,33 +1,54 @@ #!/usr/bin/env python -from __future__ import print_function -from setuptools import setup, find_packages -import sys - -lxml_requirement = "lxml" -if sys.platform == 'darwin': - import platform - mac_ver = platform.mac_ver()[0] - mac_ver_no = int(mac_ver.split('.')[1]) - if mac_ver_no < 9: - print("Using lxml<2.4") - lxml_requirement = "lxml<2.4" + +import codecs +import os +import re +from setuptools import setup + +speed_deps = [ + "cchardet", +] + +extras = { + 'speed': speed_deps, +} + +# Adapted from https://github.com/pypa/pip/blob/master/setup.py +def find_version(*file_paths): + here = os.path.abspath(os.path.dirname(__file__)) + + # Intentionally *not* adding an encoding option to open, See: + # https://github.com/pypa/virtualenv/issues/201#issuecomment-3145690 + with codecs.open(os.path.join(here, *file_paths), "r") as fp: + version_file = fp.read() + version_match = re.search( + r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M, + ) + if version_match: + return version_match.group(1) + + raise RuntimeError("Unable to find version string.") + setup( name="readability-lxml", - version="0.6.2", + version=find_version("readability", "__init__.py"), author="Yuri Baburov", author_email="burchik@gmail.com", - description="fast html to text parser (article readability tool) with python3 support", - test_suite = "tests.test_article_only", - long_description=open("README.rst").read(), + description="fast html to text parser (article readability tool) with python 3 support", + test_suite="tests.test_article_only", + long_description=open("README.md").read(), + long_description_content_type="text/markdown", license="Apache License 2.0", url="http://github.com/buriy/python-readability", - packages=['readability', 'readability.compat'], + packages=["readability"], install_requires=[ "chardet", - lxml_requirement, + "lxml[html_clean]", + "lxml-html-clean; python_version < '3.11'", "cssselect" - ], + ], + extras_require=extras, classifiers=[ "Environment :: Web Environment", "Intended Audience :: Developers", @@ -37,12 +58,13 @@ "Topic :: Internet", "Topic :: Software Development :: Libraries :: Python Modules", "Programming Language :: Python", - "Programming Language :: Python :: 2", - "Programming Language :: Python :: 2.6", - "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.3", - "Programming Language :: Python :: 3.4", - + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming 
Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: Implementation :: PyPy", ], ) diff --git a/tests/samples/summary-keep-all-images.sample.html b/tests/samples/summary-keep-all-images.sample.html new file mode 100644 index 00000000..127683fc --- /dev/null +++ b/tests/samples/summary-keep-all-images.sample.html @@ -0,0 +1,29 @@ + + + + +
[sample markup stripped during extraction; only its text survives — an "H2 Headline …" heading, two "Text …" blocks, and the image elements the keep-all-images test exercises]
    + + \ No newline at end of file diff --git a/tests/samples/utf-8-kanji.sample.html b/tests/samples/utf-8-kanji.sample.html new file mode 100644 index 00000000..fa1b6527 --- /dev/null +++ b/tests/samples/utf-8-kanji.sample.html @@ -0,0 +1,60 @@ + + + +
    +
    +
    + 草枕 + 夏目漱石 + + + 一 + +  山路を登りながら、こう考えた。 +  智に働けば角が立つ。情に棹させば流される。意地を通せば窮屈だ。とかくに人の世は住みにくい。 +  住みにくさが高じると、安い所へ引き越したくなる。どこへ越しても住みにくいと悟った時、詩が生れて、画が出来る。 +  人の世を作ったものは神でもなければ鬼でもない。やはり向う三軒両隣りにちらちらするただの人である。ただの人が作った人の世が住みにくいからとて、越す国はあるまい。あれば人でなしの国へ行くばかりだ。人でなしの国は人の世よりもなお住みにくかろう。 +  越す事のならぬ世が住みにくければ、住みにくい所をどれほどか、寛容て、束の間の命を、束の間でも住みよくせねばならぬ。ここに詩人という天職が出来て、ここに画家という使命が降る。あらゆる芸術の士は人の世を長閑にし、人の心を豊かにするが故に尊とい。 +  住みにくき世から、住みにくき煩いを引き抜いて、ありがたい世界をまのあたりに写すのが詩である、画である。あるは音楽と彫刻である。こまかに云えば写さないでもよい。ただまのあたりに見れば、そこに詩も生き、歌も湧く。着想を紙に落さぬとも※(「王+膠のつくり」、第3水準1-88-22)鏘の音は胸裏に起る。丹青は画架に向って塗抹せんでも五彩の絢爛は自から心眼に映る。ただおのが住む世を、かく観じ得て、霊台方寸のカメラに澆季溷濁の俗界を清くうららかに収め得れば足る。この故に無声の詩人には一句なく、無色の画家には尺※(「糸+賺のつくり」、第3水準1-90-17)なきも、かく人世を観じ得るの点において、かく煩悩を解脱するの点において、かく清浄界に出入し得るの点において、またこの不同不二の乾坤を建立し得るの点において、我利私慾の覊絆を掃蕩するの点において、――千金の子よりも、万乗の君よりも、あらゆる俗界の寵児よりも幸福である。 +  世に住むこと二十年にして、住むに甲斐ある世と知った。二十五年にして明暗は表裏のごとく、日のあたる所にはきっと影がさすと悟った。三十の今日はこう思うている。――喜びの深きとき憂いよいよ深く、楽みの大いなるほど苦しみも大きい。これを切り放そうとすると身が持てぬ。片づけようとすれば世が立たぬ。金は大事だ、大事なものが殖えれば寝る間も心配だろう。恋はうれしい、嬉しい恋が積もれば、恋をせぬ昔がかえって恋しかろ。閣僚の肩は数百万人の足を支えている。背中には重い天下がおぶさっている。うまい物も食わねば惜しい。少し食えば飽き足らぬ。存分食えばあとが不愉快だ。…… +  余の考がここまで漂流して来た時に、余の右足は突然坐りのわるい角石の端を踏み損くなった。平衡を保つために、すわやと前に飛び出した左足が、仕損じの埋め合せをすると共に、余の腰は具合よく方三尺ほどな岩の上に卸りた。肩にかけた絵の具箱が腋の下から躍り出しただけで、幸いと何の事もなかった。 +  立ち上がる時に向うを見ると、路から左の方にバケツを伏せたような峰が聳えている。杉か檜か分からないが根元から頂きまでことごとく蒼黒い中に、山桜が薄赤くだんだらに棚引いて、続ぎ目が確と見えぬくらい靄が濃い。少し手前に禿山が一つ、群をぬきんでて眉に逼る。禿げた側面は巨人の斧で削り去ったか、鋭どき平面をやけに谷の底に埋めている。天辺に一本見えるのは赤松だろう。枝の間の空さえ判然している。行く手は二丁ほどで切れているが、高い所から赤い毛布が動いて来るのを見ると、登ればあすこへ出るのだろう。路はすこぶる難義だ。 +  土をならすだけならさほど手間も入るまいが、土の中には大きな石がある。土は平らにしても石は平らにならぬ。石は切り砕いても、岩は始末がつかぬ。掘崩した土の上に悠然と峙って、吾らのために道を譲る景色はない。向うで聞かぬ上は乗り越すか、廻らなければならん。巌のない所でさえ歩るきよくはない。左右が高くって、中心が窪んで、まるで一間幅を三角に穿って、その頂点が真中を貫いていると評してもよい。路を行くと云わんより川底を渉ると云う方が適当だ。固より急ぐ旅でないから、ぶらぶらと七曲りへかかる。 +  たちまち足の下で雲雀の声がし出した。谷を見下したが、どこで鳴いてるか影も形も見えぬ。ただ声だけが明らかに聞える。せっせと忙しく、絶間なく鳴いている。方幾里の空気が一面に蚤に刺されていたたまれないような気がする。あの鳥の鳴く音には瞬時の余裕もない。のどかな春の日を鳴き尽くし、鳴きあかし、また鳴き暮らさなければ気が済まんと見える。その上どこまでも登って行く、いつまでも登って行く。雲雀はきっと雲の中で死ぬに相違ない。登り詰めた揚句は、流れて雲に入って、漂うているうちに形は消えてなくなって、ただ声だけが空の裡に残るのかも知れない。 +  巌角を鋭どく廻って、按摩なら真逆様に落つるところを、際どく右へ切れて、横に見下すと、菜の花が一面に見える。雲雀はあすこへ落ちるのかと思った。いいや、あの黄金の原から飛び上がってくるのかと思った。次には落ちる雲雀と、上る雲雀が十文字にすれ違うのかと思った。最後に、落ちる時も、上る時も、また十文字に擦れ違うときにも元気よく鳴きつづけるだろうと思った。 +  春は眠くなる。猫は鼠を捕る事を忘れ、人間は借金のある事を忘れる。時には自分の魂の居所さえ忘れて正体なくなる。ただ菜の花を遠く望んだときに眼が醒める。雲雀の声を聞いたときに魂のありかが判然する。雲雀の鳴くのは口で鳴くのではない、魂全体が鳴くのだ。魂の活動が声にあらわれたもののうちで、あれほど元気のあるものはない。ああ愉快だ。こう思って、こう愉快になるのが詩である。 +  たちまちシェレーの雲雀の詩を思い出して、口のうちで覚えたところだけ暗誦して見たが、覚えているところは二三句しかなかった。その二三句のなかにこんなのがある。 +   We look before and after +     And pine for what is not: +   Our sincerest laughter +     With some pain is fraught; + Our sweetest songs are those that tell of saddest thought. 
+ 「前をみては、後えを見ては、物欲しと、あこがるるかなわれ。腹からの、笑といえど、苦しみの、そこにあるべし。うつくしき、極みの歌に、悲しさの、極みの想、籠るとぞ知れ」 +  なるほどいくら詩人が幸福でも、あの雲雀のように思い切って、一心不乱に、前後を忘却して、わが喜びを歌う訳には行くまい。西洋の詩は無論の事、支那の詩にも、よく万斛の愁などと云う字がある。詩人だから万斛で素人なら一合で済むかも知れぬ。して見ると詩人は常の人よりも苦労性で、凡骨の倍以上に神経が鋭敏なのかも知れん。超俗の喜びもあろうが、無量の悲も多かろう。そんならば詩人になるのも考え物だ。 +  しばらくは路が平で、右は雑木山、左は菜の花の見つづけである。足の下に時々蒲公英を踏みつける。鋸のような葉が遠慮なく四方へのして真中に黄色な珠を擁護している。菜の花に気をとられて、踏みつけたあとで、気の毒な事をしたと、振り向いて見ると、黄色な珠は依然として鋸のなかに鎮座している。呑気なものだ。また考えをつづける。 +  詩人に憂はつきものかも知れないが、あの雲雀を聞く心持になれば微塵の苦もない。菜の花を見ても、ただうれしくて胸が躍るばかりだ。蒲公英もその通り、桜も――桜はいつか見えなくなった。こう山の中へ来て自然の景物に接すれば、見るものも聞くものも面白い。面白いだけで別段の苦しみも起らぬ。起るとすれば足が草臥れて、旨いものが食べられぬくらいの事だろう。 +  しかし苦しみのないのはなぜだろう。ただこの景色を一幅の画として観、一巻の詩として読むからである。画であり詩である以上は地面を貰って、開拓する気にもならねば、鉄道をかけて一儲けする了見も起らぬ。ただこの景色が――腹の足しにもならぬ、月給の補いにもならぬこの景色が景色としてのみ、余が心を楽ませつつあるから苦労も心配も伴わぬのだろう。自然の力はここにおいて尊とい。吾人の性情を瞬刻に陶冶して醇乎として醇なる詩境に入らしむるのは自然である。 +  恋はうつくしかろ、孝もうつくしかろ、忠君愛国も結構だろう。しかし自身がその局に当れば利害の旋風に捲き込まれて、うつくしき事にも、結構な事にも、目は眩んでしまう。したがってどこに詩があるか自身には解しかねる。 +  これがわかるためには、わかるだけの余裕のある第三者の地位に立たねばならぬ。三者の地位に立てばこそ芝居は観て面白い。小説も見て面白い。芝居を見て面白い人も、小説を読んで面白い人も、自己の利害は棚へ上げている。見たり読んだりする間だけは詩人である。 +  それすら、普通の芝居や小説では人情を免かれぬ。苦しんだり、怒ったり、騒いだり、泣いたりする。見るものもいつかその中に同化して苦しんだり、怒ったり、騒いだり、泣いたりする。取柄は利慾が交らぬと云う点に存するかも知れぬが、交らぬだけにその他の情緒は常よりは余計に活動するだろう。それが嫌だ。 +  苦しんだり、怒ったり、騒いだり、泣いたりは人の世につきものだ。余も三十年の間それを仕通して、飽々した。飽き飽きした上に芝居や小説で同じ刺激を繰り返しては大変だ。余が欲する詩はそんな世間的の人情を鼓舞するようなものではない。俗念を放棄して、しばらくでも塵界を離れた心持ちになれる詩である。いくら傑作でも人情を離れた芝居はない、理非を絶した小説は少かろう。どこまでも世間を出る事が出来ぬのが彼らの特色である。ことに西洋の詩になると、人事が根本になるからいわゆる詩歌の純粋なるものもこの境を解脱する事を知らぬ。どこまでも同情だとか、愛だとか、正義だとか、自由だとか、浮世の勧工場にあるものだけで用を弁じている。いくら詩的になっても地面の上を馳けてあるいて、銭の勘定を忘れるひまがない。シェレーが雲雀を聞いて嘆息したのも無理はない。 +  うれしい事に東洋の詩歌はそこを解脱したのがある。採菊東籬下、悠然見南山。ただそれぎりの裏に暑苦しい世の中をまるで忘れた光景が出てくる。垣の向うに隣りの娘が覗いてる訳でもなければ、南山に親友が奉職している次第でもない。超然と出世間的に利害損得の汗を流し去った心持ちになれる。独坐幽篁裏、弾琴復長嘯、深林人不知、明月来相照。ただ二十字のうちに優に別乾坤を建立している。この乾坤の功徳は「不如帰」や「金色夜叉」の功徳ではない。汽船、汽車、権利、義務、道徳、礼義で疲れ果てた後に、すべてを忘却してぐっすり寝込むような功徳である。 +  二十世紀に睡眠が必要ならば、二十世紀にこの出世間的の詩味は大切である。惜しい事に今の詩を作る人も、詩を読む人もみんな、西洋人にかぶれているから、わざわざ呑気な扁舟を泛べてこの桃源に溯るものはないようだ。余は固より詩人を職業にしておらんから、王維や淵明の境界を今の世に布教して広げようと云う心掛も何もない。ただ自分にはこう云う感興が演芸会よりも舞踏会よりも薬になるように思われる。ファウストよりも、ハムレットよりもありがたく考えられる。こうやって、ただ一人絵の具箱と三脚几を担いで春の山路をのそのそあるくのも全くこれがためである。淵明、王維の詩境を直接に自然から吸収して、すこしの間でも非人情の天地に逍遥したいからの願。一つの酔興だ。 +  もちろん人間の一分子だから、いくら好きでも、非人情はそう長く続く訳には行かぬ。淵明だって年が年中南山を見詰めていたのでもあるまいし、王維も好んで竹藪の中に蚊帳を釣らずに寝た男でもなかろう。やはり余った菊は花屋へ売りこかして、生えた筍は八百屋へ払い下げたものと思う。こう云う余もその通り。いくら雲雀と菜の花が気に入ったって、山のなかへ野宿するほど非人情が募ってはおらん。こんな所でも人間に逢う。じんじん端折りの頬冠りや、赤い腰巻の姉さんや、時には人間より顔の長い馬にまで逢う。百万本の檜に取り囲まれて、海面を抜く何百尺かの空気を呑んだり吐いたりしても、人の臭いはなかなか取れない。それどころか、山を越えて落ちつく先の、今宵の宿は那古井の温泉場だ。 +  ただ、物は見様でどうでもなる。レオナルド・ダ・ヴィンチが弟子に告げた言に、あの鐘の音を聞け、鐘は一つだが、音はどうとも聞かれるとある。一人の男、一人の女も見様次第でいかようとも見立てがつく。どうせ非人情をしに出掛けた旅だから、そのつもりで人間を見たら、浮世小路の何軒目に狭苦しく暮した時とは違うだろう。よし全く人情を離れる事が出来んでも、せめて御能拝見の時くらいは淡い心持ちにはなれそうなものだ。能にも人情はある。七騎落でも、墨田川でも泣かぬとは保証が出来ん。しかしあれは情三分芸七分で見せるわざだ。我らが能から享けるありがた味は下界の人情をよくそのままに写す手際から出てくるのではない。そのままの上へ芸術という着物を何枚も着せて、世の中にあるまじき悠長な振舞をするからである。 +  
しばらくこの旅中に起る出来事と、旅中に出逢う人間を能の仕組と能役者の所作に見立てたらどうだろう。まるで人情を棄てる訳には行くまいが、根が詩的に出来た旅だから、非人情のやりついでに、なるべく節倹してそこまでは漕ぎつけたいものだ。南山や幽篁とは性の違ったものに相違ないし、また雲雀や菜の花といっしょにする事も出来まいが、なるべくこれに近づけて、近づけ得る限りは同じ観察点から人間を視てみたい。芭蕉と云う男は枕元へ馬が尿するのをさえ雅な事と見立てて発句にした。余もこれから逢う人物を――百姓も、町人も、村役場の書記も、爺さんも婆さんも――ことごとく大自然の点景として描き出されたものと仮定して取こなして見よう。もっとも画中の人物と違って、彼らはおのがじし勝手な真似をするだろう。しかし普通の小説家のようにその勝手な真似の根本を探ぐって、心理作用に立ち入ったり、人事葛藤の詮議立てをしては俗になる。動いても構わない。画中の人間が動くと見れば差し支ない。画中の人物はどう動いても平面以外に出られるものではない。平面以外に飛び出して、立方的に働くと思えばこそ、こっちと衝突したり、利害の交渉が起ったりして面倒になる。面倒になればなるほど美的に見ている訳に行かなくなる。これから逢う人間には超然と遠き上から見物する気で、人情の電気がむやみに双方で起らないようにする。そうすれば相手がいくら働いても、こちらの懐には容易に飛び込めない訳だから、つまりは画の前へ立って、画中の人物が画面の中をあちらこちらと騒ぎ廻るのを見るのと同じ訳になる。間三尺も隔てていれば落ちついて見られる。あぶな気なしに見られる。言を換えて云えば、利害に気を奪われないから、全力を挙げて彼らの動作を芸術の方面から観察する事が出来る。余念もなく美か美でないかと鑒識する事が出来る。 +  ここまで決心をした時、空があやしくなって来た。煮え切れない雲が、頭の上へ靠垂れ懸っていたと思ったが、いつのまにか、崩れ出して、四方はただ雲の海かと怪しまれる中から、しとしとと春の雨が降り出した。菜の花は疾くに通り過して、今は山と山の間を行くのだが、雨の糸が濃かでほとんど霧を欺くくらいだから、隔たりはどれほどかわからぬ。時々風が来て、高い雲を吹き払うとき、薄黒い山の背が右手に見える事がある。何でも谷一つ隔てて向うが脈の走っている所らしい。左はすぐ山の裾と見える。深く罩める雨の奥から松らしいものが、ちょくちょく顔を出す。出すかと思うと、隠れる。雨が動くのか、木が動くのか、夢が動くのか、何となく不思議な心持ちだ。 +  路は存外広くなって、かつ平だから、あるくに骨は折れんが、雨具の用意がないので急ぐ。帽子から雨垂れがぽたりぽたりと落つる頃、五六間先きから、鈴の音がして、黒い中から、馬子がふうとあらわれた。 + 「ここらに休む所はないかね」 + 「もう十五丁行くと茶屋がありますよ。だいぶ濡れたね」 +  まだ十五丁かと、振り向いているうちに、馬子の姿は影画のように雨につつまれて、またふうと消えた。 +  糠のように見えた粒は次第に太く長くなって、今は一筋ごとに風に捲かれる様までが目に入る。羽織はとくに濡れ尽して肌着に浸み込んだ水が、身体の温度で生暖く感ぜられる。気持がわるいから、帽を傾けて、すたすた歩行く。 +  茫々たる薄墨色の世界を、幾条の銀箭が斜めに走るなかを、ひたぶるに濡れて行くわれを、われならぬ人の姿と思えば、詩にもなる、句にも咏まれる。有体なる己れを忘れ尽して純客観に眼をつくる時、始めてわれは画中の人物として、自然の景物と美しき調和を保つ。ただ降る雨の心苦しくて、踏む足の疲れたるを気に掛ける瞬間に、われはすでに詩中の人にもあらず、画裡の人にもあらず。依然として市井の一豎子に過ぎぬ。雲煙飛動の趣も眼に入らぬ。落花啼鳥の情けも心に浮ばぬ。蕭々として独り春山を行く吾の、いかに美しきかはなおさらに解せぬ。初めは帽を傾けて歩行た。後にはただ足の甲のみを見詰めてあるいた。終りには肩をすぼめて、恐る恐る歩行た。雨は満目の樹梢を揺かして四方より孤客に逼る。非人情がちと強過ぎたようだ。 +
    +
    +
    + + + diff --git a/tests/test_article_only.py b/tests/test_article_only.py index 882d346d..fe322121 100644 --- a/tests/test_article_only.py +++ b/tests/test_article_only.py @@ -1,15 +1,43 @@ import os +import time import unittest from readability import Document +from functools import wraps -SAMPLES = os.path.join(os.path.dirname(__file__), 'samples') +class TimeoutException(Exception): + """Exception raised when a function exceeds its time limit.""" + pass + + +def timeout(seconds): + """Decorator to enforce a timeout on function execution.""" + def decorator(func): + @wraps(func) + def wrapper(*args, **kwargs): + start_time = time.perf_counter() + result = func(*args, **kwargs) + end_time = time.perf_counter() + elapsed_time = end_time - start_time + if elapsed_time > seconds: + raise TimeoutException( + f"Function '{func.__name__}' exceeded time limit of {seconds} seconds " + f"with an execution time of {elapsed_time:.4f} seconds" + ) + return result + return wrapper + return decorator + + +SAMPLES = os.path.join(os.path.dirname(__file__), "samples") def load_sample(filename): """Helper to get the content out of the sample files""" - return open(os.path.join(SAMPLES, filename)).read() + with open(os.path.join(SAMPLES, filename)) as f: + html = f.read() + return html class TestArticleOnly(unittest.TestCase): @@ -23,30 +51,34 @@ class TestArticleOnly(unittest.TestCase): def test_si_sample(self): """Using the si sample, load article with only opening body element""" - sample = load_sample('si-game.sample.html') + sample = load_sample("si-game.sample.html") doc = Document( sample, - url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html') + url="http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html", + ) res = doc.summary() - self.assertEqual('
    ' - ' ' - '

    1234567890123456789012345

    ' - ' ' - '' + " " + "

    1234567890123456789012345

    " + " " + "" ) doc = Document(sample) doc.summary() + + def test_correct_cleanup(self): + sample = """ + + +
    test section
    +
    +

    Lot of text here.

    + +

    More text is written here, and contains punctuation and dots.

    +
    +
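For reviewers, a minimal usage sketch of the API this diff leaves behind (Python 3 only). The Document(...) arguments and the urllib fetch mirror the rewritten main(); passing keep_all_images to summary() is an assumption — this excerpt shows sanitize() gaining the parameter and summary() forwarding a keep_all_images value, but not summary()'s new signature.

```python
# Sketch only; example.com is a placeholder URL, and keep_all_images on
# summary() is assumed rather than shown in this excerpt.
import urllib.request

from readability import Document

url = "https://example.com/article.html"  # placeholder
req = urllib.request.Request(url, None, {"User-Agent": "Mozilla/5.0"})
with urllib.request.urlopen(req) as resp:
    html = resp.read()

doc = Document(
    html,
    url=url,                              # page URL, as main() passes it
    positive_keywords="article,content",  # comma-separated, as in the CLI options
    negative_keywords="sidebar,comment",
)

print(doc.short_title())
# Assumed keyword: summary() forwards keep_all_images to sanitize(), which then
# skips conditional cleaning of <div>s that contain at least one <img>.
print(doc.summary(keep_all_images=True))
```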