From 27159f45b36ab180710b66e617b264a756952526 Mon Sep 17 00:00:00 2001 From: Hugo Date: Sun, 26 Nov 2017 22:14:25 +0200 Subject: [PATCH 01/98] Drop support for EOL Python 2.6 --- .travis.yml | 1 - setup.py | 1 - tox.ini | 2 +- 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 9a668090..1e797655 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,7 +4,6 @@ python: - "3.4" env: - - TOX_ENV=py26 - TOX_ENV=py27 - TOX_ENV=py33 - TOX_ENV=py34 diff --git a/setup.py b/setup.py index 18a4faeb..0f032ded 100755 --- a/setup.py +++ b/setup.py @@ -38,7 +38,6 @@ "Topic :: Software Development :: Libraries :: Python Modules", "Programming Language :: Python", "Programming Language :: Python :: 2", - "Programming Language :: Python :: 2.6", "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.3", diff --git a/tox.ini b/tox.ini index 50b4a74d..c0206c08 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. [tox] -envlist = py26, py27, py33, py34 +envlist = py27, py33, py34 [testenv] deps=pytest From f74adc6893ad203919337d6fe4ce58a9df034d13 Mon Sep 17 00:00:00 2001 From: Hugo Date: Sun, 26 Nov 2017 22:20:57 +0200 Subject: [PATCH 02/98] Drop support for EOL Python 3.3 --- .travis.yml | 1 - setup.py | 1 - tox.ini | 2 +- 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 1e797655..e34557af 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,7 +5,6 @@ python: env: - TOX_ENV=py27 - - TOX_ENV=py33 - TOX_ENV=py34 install: diff --git a/setup.py b/setup.py index 0f032ded..bcc62096 100755 --- a/setup.py +++ b/setup.py @@ -40,7 +40,6 @@ "Programming Language :: Python :: 2", "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.3", "Programming Language :: Python :: 3.4", ], diff --git a/tox.ini b/tox.ini index c0206c08..7388ac46 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. [tox] -envlist = py27, py33, py34 +envlist = py27, py34 [testenv] deps=pytest From 4172699812cd2a80f48a46fff994344e220b0be8 Mon Sep 17 00:00:00 2001 From: Hugo Date: Sun, 26 Nov 2017 22:23:31 +0200 Subject: [PATCH 03/98] Add Python 3.5 and 3.6 --- .travis.yml | 4 +++- setup.py | 2 ++ tox.ini | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index e34557af..5bdf1d4f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,11 +1,13 @@ language: python python: - - "3.4" + - "3.6" env: - TOX_ENV=py27 - TOX_ENV=py34 + - TOX_ENV=py35 + - TOX_ENV=py36 install: - travis_retry pip install -U pip wheel tox diff --git a/setup.py b/setup.py index bcc62096..c3d73626 100755 --- a/setup.py +++ b/setup.py @@ -41,6 +41,8 @@ "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.4", + "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", ], ) diff --git a/tox.ini b/tox.ini index 7388ac46..b29c95ed 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. [tox] -envlist = py27, py34 +envlist = py27, py34, py35, py36 [testenv] deps=pytest From f4a04732fd2a3519ec4a3c1f66e669a20f1a2275 Mon Sep 17 00:00:00 2001 From: Hugo Date: Sun, 26 Nov 2017 22:26:41 +0200 Subject: [PATCH 04/98] Workaround for py35 --- .travis.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.travis.yml b/.travis.yml index 5bdf1d4f..b0ebc5ed 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,6 +9,10 @@ env: - TOX_ENV=py35 - TOX_ENV=py36 +before_install: + # work around https://github.com/travis-ci/travis-ci/issues/8363 + - pyenv global system 3.5 + install: - travis_retry pip install -U pip wheel tox - travis_retry pip install -U -r requirements.txt -e . From 537de2b8f6ac79b4a1d9f4d74855b0a186c021db Mon Sep 17 00:00:00 2001 From: Yuri Baburov Date: Mon, 7 May 2018 12:28:25 +0700 Subject: [PATCH 05/98] Improved remove_unlikely_candidates following an advice from issue #102 --- readability/readability.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/readability/readability.py b/readability/readability.py index 8331e279..90fbc138 100755 --- a/readability/readability.py +++ b/readability/readability.py @@ -395,7 +395,7 @@ def score_node(self, elem): } def remove_unlikely_candidates(self): - for elem in self.html.iter(): + for elem in self.html.findall('.//*'): s = "%s %s" % (elem.get('class', ''), elem.get('id', '')) if len(s) < 2: continue From 0e50b53d056359fa1020c9279e4f876be90cb484 Mon Sep 17 00:00:00 2001 From: Yuri Baburov Date: Mon, 7 May 2018 12:31:39 +0700 Subject: [PATCH 06/98] Release version 0.7 . Better HTML5 support and an important bugfix. --- .travis.yml | 2 +- Makefile | 22 +++++++++++----------- README.rst | 12 +++++++----- readability/readability.py | 10 +++++----- setup.py | 3 +-- tests/test_article_only.py | 31 +++++++++++++++++++++++++++++++ 6 files changed, 56 insertions(+), 24 deletions(-) diff --git a/.travis.yml b/.travis.yml index b0ebc5ed..b542c481 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,7 +11,7 @@ env: before_install: # work around https://github.com/travis-ci/travis-ci/issues/8363 - - pyenv global system 3.5 + - pyenv global system 3.6 install: - travis_retry pip install -U pip wheel tox diff --git a/Makefile b/Makefile index 0a28f375..3daf2d1d 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,9 @@ # Makefile to help automate tasks WD := $(shell pwd) -PY := .env/bin/python -PIP := .env/bin/pip -PEP8 := .env/bin/pep8 -NOSE := .env/bin/nosetests - +PY := .venv/bin/python +PIP := .venv/bin/pip +PEP8 := .venv/bin/pep8 +NOSE := .venv/bin/nosetests # ########### # Tests rule! @@ -22,16 +21,17 @@ $(NOSE): .PHONY: all all: venv develop -venv: bin/python -bin/python: - virtualenv .env +venv: .venv/bin/python + +.venv/bin/python: + virtualenv .venv .PHONY: clean_venv clean_venv: - rm -rf .env + rm -rf .venv -develop: .env/lib/python*/site-packages/readability-lxml.egg-link -.env/lib/python*/site-packages/readability-lxml.egg-link: +develop: .venv/lib/python*/site-packages/readability-lxml.egg-link +.venv/lib/python*/site-packages/readability-lxml.egg-link: $(PY) setup.py develop diff --git a/README.rst b/README.rst index 51eac4af..518c7553 100644 --- a/README.rst +++ b/README.rst @@ -35,13 +35,15 @@ Usage Change Log ---------- -- 0.3 Added Document.encoding, positive\_keywords and - negative\_keywords -- 0.4 Added Videos loading and allowed more images per paragraph -- 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and - 3.4 +- 0.7 Improved HTML5 tags handling. Heuristics were changed for a lot of sites: Fixed an important +bug with stripping unwanted HTML nodes (only first matching node was removed before). - 0.6 Finally a release which supports Python versions 2.6, 2.7, 3.3 and 3.4 +- 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and + 3.4 +- 0.4 Added Videos loading and allowed more images per paragraph +- 0.3 Added Document.encoding, positive\_keywords and + negative\_keywords Licensing ========= diff --git a/readability/readability.py b/readability/readability.py index 90fbc138..12f3d959 100755 --- a/readability/readability.py +++ b/readability/readability.py @@ -381,13 +381,13 @@ def class_weight(self, e): def score_node(self, elem): content_score = self.class_weight(elem) name = elem.tag.lower() - if name == "div": + if name in ["div", "article"]: content_score += 5 elif name in ["pre", "td", "blockquote"]: content_score += 3 - elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form"]: + elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form", "aside"]: content_score -= 3 - elif name in ["h1", "h2", "h3", "h4", "h5", "h6", "th"]: + elif name in ["h1", "h2", "h3", "h4", "h5", "h6", "th", "header", "footer", "nav"]: content_score -= 5 return { 'content_score': content_score, @@ -463,7 +463,7 @@ def sanitize(self, node, candidates): allowed = {} # Conditionally clean s,
    s, and
    s - for el in self.reverse_tags(node, "table", "ul", "div"): + for el in self.reverse_tags(node, "table", "ul", "div", "aside", "header", "footer", "section"): if el in allowed: continue weight = self.class_weight(el) @@ -577,7 +577,7 @@ def sanitize(self, node, candidates): if siblings and sum(siblings) > 1000: to_remove = False log.debug("Allowing %s" % describe(el)) - for desnode in self.tags(el, "table", "ul", "div"): + for desnode in self.tags(el, "table", "ul", "div", "section"): allowed[desnode] = True if to_remove: diff --git a/setup.py b/setup.py index c3d73626..09744b85 100755 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ setup( name="readability-lxml", - version="0.6.2", + version="0.7", author="Yuri Baburov", author_email="burchik@gmail.com", description="fast html to text parser (article readability tool) with python3 support", @@ -43,6 +43,5 @@ "Programming Language :: Python :: 3.4", "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", - ], ) diff --git a/tests/test_article_only.py b/tests/test_article_only.py index 882d346d..87e623c7 100644 --- a/tests/test_article_only.py +++ b/tests/test_article_only.py @@ -61,3 +61,34 @@ def test_best_elem_is_root_and_passing(self): ) doc = Document(sample) doc.summary() + + def test_correct_cleanup(self): + sample = """ + + +
    test section
    +
    +

    Lot of text here.

    + +

    More text is written here, and contains punctuation and dots.

    +
    +
s,