diff --git a/readability/encoding.py b/readability/encoding.py
index fb4761df..71f9eb67 100644
--- a/readability/encoding.py
+++ b/readability/encoding.py
@@ -1,24 +1,25 @@
 import re
 import chardet
+
+# Regex for XML and HTML Meta charset declaration
+charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
+pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
+xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
+
+
 def get_encoding(page):
-    # Regex for XML and HTML Meta charset declaration
-    charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
-    pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
-    xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
-
     declared_encodings = (charset_re.findall(page) +
             pragma_re.findall(page) +
             xml_re.findall(page))
 
     # Try any declared encodings
-    if len(declared_encodings) > 0:
-        for declared_encoding in declared_encodings:
-            try:
-                page.decode(custom_decode(declared_encoding))
-                return custom_decode(declared_encoding)
-            except UnicodeDecodeError:
-                pass
+    for declared_encoding in reversed(declared_encodings):
+        try:
+            page.decode(custom_decode(declared_encoding))
+            return custom_decode(declared_encoding)
+        except UnicodeDecodeError:
+            pass
 
     # Fallback to chardet if declared encodings fail
     text = re.sub('</?[^>]*>\s*', ' ', page)
diff --git a/readability/htmls.py b/readability/htmls.py
index 15eada31..6dbff03c 100644
--- a/readability/htmls.py
+++ b/readability/htmls.py
@@ -1,7 +1,6 @@
 from cleaners import normalize_spaces, clean_attributes
 from encoding import get_encoding
 from lxml.html import tostring
-import logging
 import lxml.html
 import re, sys
 
diff --git a/readability/readability.py b/readability/readability.py
index 9b393d08..70147688 100755
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -1,7 +1,9 @@
 #!/usr/bin/env python
+
 import logging
 import re
 import sys
+import six
 
 from collections import defaultdict
 from lxml.etree import tostring
@@ -105,6 +107,7 @@ def __init__(self, input, positive_keywords=None, negative_keywords=None, **options):
         - url: will allow adjusting links to be absolute
         - positive_keywords: the list of positive search patterns in classes and ids, for example: ["news-item", "block"]
         - negative_keywords: the list of negative search patterns in classes and ids, for example: ["mysidebar", "related", "ads"]
+        - unwanted_tags: the list of tag names to strip from the document. Note that positive_keywords and negative_keywords may also be given as regular expressions.
""" self.input = input @@ -113,6 +116,12 @@ def __init__(self, input, positive_keywords=None, negative_keywords=None, **opti self.encoding = None self.positive_keywords = compile_pattern(positive_keywords) self.negative_keywords = compile_pattern(negative_keywords) + self.unwanted_tags = set(options.pop('unwanted_tags', [])) + + def strip_tags(self, elt): + for tag in self.unwanted_tags: + for elt in elt.xpath('.//%s' % tag): + elt.getparent().remove(elt) def _html(self, force=False): if force or self.html is None: @@ -182,7 +191,11 @@ def summary(self, html_partial=False): article = self.html.find('body') if article is None: article = self.html + + self.strip_tags(article) cleaned_article = self.sanitize(article, candidates) + assert isinstance(cleaned_article, six.string_types) + article_length = len(cleaned_article or '') retry_length = self.options.get( 'retry_length', @@ -556,6 +569,9 @@ def sanitize(self, node, candidates): pass self.html = node + for elt in self.html.xpath('//p'): + elt.tail = '\n\n' + return self.get_clean_html() diff --git a/setup.py b/setup.py index e7bb5884..993941e5 100755 --- a/setup.py +++ b/setup.py @@ -8,7 +8,6 @@ mac_ver = platform.mac_ver()[0] mac_ver_no = int(mac_ver.split('.')[1]) if mac_ver_no < 9: - print "Using lxml<2.4" lxml_requirement = "lxml<2.4" setup( @@ -23,9 +22,9 @@ url="http://github.com/buriy/python-readability", packages=['readability'], install_requires=[ - "chardet", + "chardet", "cssselect", 'six', lxml_requirement - ], + ], classifiers=[ "Environment :: Web Environment", "Intended Audience :: Developers",