主要文章标题
+这是主要内容的第一段。
+これはコンテンツの第2段落です。
+이것은 콘텐츠의 세 번째 단락입니다.
+This is the fourth paragraph.
+diff --git a/.flake8 b/.flake8 new file mode 100644 index 00000000..b33811f1 --- /dev/null +++ b/.flake8 @@ -0,0 +1,2 @@ +[flake8] +ignore = E501, W503 \ No newline at end of file diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 00000000..23f16106 --- /dev/null +++ b/.github/workflows/python-package.yml @@ -0,0 +1,40 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: Python package + +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install flake8 pytest + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + pytest diff --git a/.gitignore b/.gitignore index d8961065..b532e65e 100644 --- a/.gitignore +++ b/.gitignore @@ -14,4 +14,5 @@ nosetests.xml .idea .cache /.noseids -/.venv \ No newline at end of file +/.venv +/poetry.lock \ No newline at end of file diff --git a/.travis.yml b/.travis.yml index 21e1ce11..ea56f519 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,47 +4,16 @@ cache: pip matrix: include: - - name: "Python 2.7 on Linux" - python: 2.7 - env: PIP=pip - - name: "Python 3.5 on Linux" - python: 3.5 - - name: "Python 3.6 on Linux" - python: 3.6 - - name: "Python 3.7 on Linux" - python: 3.7 - name: "Python 3.8 on Linux" dist: xenial python: 3.8 - name: "Python 3.9 Nightly on Linux" dist: bionic python: nightly - - name: "Pypy on Linux" - python: pypy - env: PIP=pip - name: "Pypy 3 on Linux" python: pypy3 - - name: "Python 3.7 on older macOS" - os: osx - osx_image: xcode9.4 - language: shell - env: TOXENV=py37 - before_install: - - sw_vers - - python3 --version - - pip3 --version - - name: "Python 3.7 on macOS" - os: osx - osx_image: xcode11 - language: shell - env: TOXENV=py37 - before_install: - - sw_vers - - python3 --version - - pip3 --version allow_failures: - python: nightly - - python: pypy - python: pypy3 - os: osx diff --git a/Makefile b/Makefile index 012e4b78..9caf08a5 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ PY := .venv/bin/python PIP := .venv/bin/pip PEP8 := .venv/bin/pep8 NOSE := .venv/bin/nosetests -TWINE := twine +TWINE := .venv/bin/twine # ########### # Tests rule! @@ -24,7 +24,7 @@ all: setup develop venv: .venv/bin/python setup: venv - $(PIP) install -r requirements-dev.txt + $(PIP) install -r requirements-dev.txt | grep -v "already satisfied" || true .venv/bin/python: test -d .venv || which python3 && python3 -m venv .venv || virtualenv .venv @@ -45,11 +45,16 @@ develop: .venv/lib/python*/site-packages/readability-lxml.egg-link .PHONY: clean_all clean_all: clean_venv +.PHONY: build +build: + poetry build + # ########### # Deploy # ########### .PHONY: dist dist: + $(PY) -m pip install wheel $(PY) setup.py sdist bdist_wheel $(TWINE) check dist/* @@ -57,6 +62,12 @@ dist: upload: $(TWINE) upload dist/* -.PHONY: version_update -version_update: - $(EDITOR) setup.py +.PHONY: bump +bump: + $(EDITOR) readability/__init__.py + $(eval VERSION := $(shell grep "__version__" readability/__init__.py | cut -d'"' -f2)) + # fix first occurrence of version in pyproject.toml + sed -i '0,/version = ".*"/s//version = "$(VERSION)"/' pyproject.toml + git commit -m "Bump version to $(VERSION)" pyproject.toml readability/__init__.py + git tag $(VERSION) + git push --tags diff --git a/README.md b/README.md new file mode 100644 index 00000000..e09a515a --- /dev/null +++ b/README.md @@ -0,0 +1,67 @@ +[](https://pypi.python.org/pypi/readability-lxml) + +# python-readability + +Given an HTML document, extract and clean up the main body text and title. + +This is a Python port of a Ruby port of [arc90's Readability project](https://web.archive.org/web/20130519040221/http://www.readability.com/). + +## Installation + +It's easy using `pip`, just run: + +```bash +$ pip install readability-lxml +``` + +As an alternative, you may also use conda to install, just run: + +```bash +$ conda install -c conda-forge readability-lxml +``` + +## Usage + +```python +>>> import requests +>>> from readability import Document + +>>> response = requests.get('http://example.com') +>>> doc = Document(response.content) +>>> doc.title() +'Example Domain' + +>>> doc.summary() +"""
This domain is established to be used for illustrative examples in documents. You may +use this\n domain in examples without prior coordination or asking for permission.
+\n \nThis domain is established to be used for illustrative examples in documents. You may - use this\n domain in examples without prior coordination or asking for permission.
- \n \n
+
+
foo" + long_space + "
" @@ -123,6 +149,7 @@ def test_utf8_kanji(self): sample = load_sample("utf-8-kanji.sample.html") doc = Document(sample) res = doc.summary() + assert 0 < len(res) < 10000 def test_author_present(self): sample = load_sample("the-hurricane-rubin-carter-denzel-washington.html") @@ -133,3 +160,74 @@ def test_author_absent(self): sample = load_sample("si-game.sample.html") doc = Document(sample) assert '[no-author]' == doc.author() + + def test_keep_images_present(self): + sample = load_sample("summary-keep-all-images.sample.html") + + doc = Document(sample) + + assert "这是主要内容的第一段。
+これはコンテンツの第2段落です。
+이것은 콘텐츠의 세 번째 단락입니다.
+This is the fourth paragraph.
+