diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 0000000..8b16708 --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,17 @@ +version: 2.1 +jobs: + build: + docker: + - image: cimg/python:3.12 + steps: + - checkout + - setup_remote_docker + - run: + name: Build image + command: docker build --target test -t ds-python . + - run: + name: Verify build completed + command: docker run ds-python /bin/bash -c "echo BUILDS OK" + - run: + name: Run tests + command: docker run ds-python /test_image.py -vv diff --git a/.circleci/test_image.py b/.circleci/test_image.py new file mode 100755 index 0000000..04e2df1 --- /dev/null +++ b/.circleci/test_image.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python + +import os +import re +import shutil +import subprocess +import unittest + + +# Just use the stdlib's `unittest` rather than needing to install `pytest`. +class TestImage(unittest.TestCase): + + def test_version(self): + version_in_env_var = os.getenv("VERSION") + major = os.getenv("VERSION_MAJOR") + minor = os.getenv("VERSION_MINOR") + micro = os.getenv("VERSION_MICRO") + self.assertTrue(major.isdigit()) + self.assertTrue(minor.isdigit()) + self.assertTrue(micro.isdigit()) + self.assertEqual(version_in_env_var, f"{major}.{minor}.{micro}") + + with open("CHANGELOG.md") as changelog: + version_in_changelog = re.search( + r"##\s+\[(\d+\.\d+\.\d+)]", changelog.read() + ).groups()[0] + self.assertEqual(version_in_changelog, version_in_env_var) + + def test_scipy_links_to_openblas(self): + from scipy.linalg import _fblas # noqa: F401 + + def test_numpy_can_import(self): + import numpy as np # noqa: F401 + + def test_sklearn_can_import(self): + import sklearn # noqa: F401 + + def test_civis_can_import(self): + import civis # noqa: F401 + # civis-python uses lazy imports since v2.3.0, + # so try to import the top-level modules. 
+ import civis.io # noqa: F401 + import civis.parallel # noqa: F401 + import civis.futures # noqa: F401 + import civis.ml # noqa: F401 + import civis.utils # noqa: F401 + + def test_shell_commands_available(self): + """Ensure the main shell commands are available.""" + # A non-exhaustive list of commands -- we just test those we'd likely use. + expected_cmds = "aws civis curl git pip python unzip uv wget".split() + for cmd in expected_cmds: + self.assertIsNotNone(shutil.which(cmd), f"{cmd} not found in PATH") + + def _test_shell_command(self, cmd: str): + """Run ``cmd`` in a shell; fail with its captured output on a non-zero exit.""" + try: + subprocess.run(cmd, shell=True, check=True, capture_output=True, text=True) + except subprocess.CalledProcessError as e: + self.fail( + f"{cmd!r} failed with return code {e.returncode}\n" + f"stdout: {e.stdout}\n" + f"stderr: {e.stderr}" + ) + + def test_apt_get(self): + """Ensure that apt-get works in the image.""" + self._test_shell_command("apt-get update -y && apt-get install -y htop") + + def test_uv(self): + """Ensure that uv works in the image.""" + self._test_shell_command("uv pip install python-iso639") + + +if __name__ == "__main__": + unittest.main() diff --git a/.condarc b/.condarc deleted file mode 100644 index 364cb1a..0000000 --- a/.condarc +++ /dev/null @@ -1,7 +0,0 @@ -channels: - - defaults - -show_channel_urls: True - -create_default_packages: - - nomkl diff --git a/.github/ISSUE_TEMPLATE/general.md b/.github/ISSUE_TEMPLATE/general.md new file mode 100644 index 0000000..5e6ede5 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/general.md @@ -0,0 +1,11 @@ +--- +name: General +about: Ask a question, report a potential issue, etc. +title: '' +labels: '' +assignees: '' + +--- + +**Note:** Civis employees should _not_ use the GitHub Issues feature at the public "civis-python" codebase +to file a ticket, and should instead use the internal ticketing system.
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..83b1957 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,9 @@ + + +--- + +- [ ] (For Civis employees only) Reference to a relevant ticket in the pull request title +- [ ] Changelog entry added to `CHANGELOG.md` at the repo's root level +- [ ] Description of change in the pull request description +- [ ] If applicable, unit tests have been added and/or updated +- [ ] The CircleCI builds have all passed diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c1d6aa5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.venv +.vscode diff --git a/CHANGELOG.md b/CHANGELOG.md index 738fe07..c4a22c6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,12 +5,388 @@ All changes to this project will be documented in this file. Version number changes (major.minor.micro) in this package denote the following: - A micro version will increase if the only change in a release is incrementing micro versions (bugfix-only releases) on the packages contained in this image. - A minor version will increase if one or more packages contained in the Docker image add new, backwards-compatible features, or if a new package is added to the Docker image. -- A major version will increase if there are any backwards-incompatible changes in any of the packages contained in this Docker image, or any other backwards-incompabile changes in the execution environment. +- A major version will increase if there are any backwards-incompatible changes in any of the packages contained in this Docker image, or any other backwards-incompatible changes in the execution environment. 
## Unreleased + +## [8.3.0] + +- Python version updated: 3.12.8 -> 3.13.5 +- uv version updated: 0.5.18 -> 0.7.19 +- New core dependencies added: + * polars 1.31.0 (supported by civis-python since v2.6.0) +- Core dependencies updated to latest versions: + * awscli 2.22.33 -> 2.27.48 + * boto3 1.35.97 -> 1.39.2 + * civis 2.4.3 -> 2.7.1 + * numpy 2.2.1 -> 2.3.1 + * pandas 2.2.3 -> 2.3.0 + * requests 2.32.3 -> 2.32.4 + * scikit-learn 1.6.1 -> 1.7.0 + * scipy 1.15.1 -> 1.16.0 + +## [8.2.0] + +- Python version updated: 3.12.7 -> 3.12.8 +- uv version updated: 0.5.1 -> 0.5.18 +- Core dependencies updated to latest versions: + * awscli 2.19.5 -> 2.22.33 + * boto3 1.35.58 -> 1.35.97 + * civis 2.4.0 -> 2.4.3 + * numpy 2.1.3 -> 2.2.1 + * scikit-learn 1.5.2 -> 1.6.1 + * scipy 1.14.1 -> 1.15.1 + +## [8.1.0] + +- Python version updated to v3.12.7 +- Core dependencies updated to latest versions: + * awscli 2.17.37 -> 2.19.5 + * boto3 1.34.127 -> 1.35.58 + * civis 2.3.0 -> 2.4.0 + * numpy 2.0.0 -> 2.1.3 + * pandas 2.2.2 -> 2.2.3 + * scikit-learn 1.5.0 -> 1.5.2 + * scipy 1.13.1 -> 1.14.1 +- uv added to the image + +## [8.0.1] + +- Python version updated to v3.12.6 + +## [8.0.0] +- Core dependencies updated to latest versions: + * awscli 1.33.9 -> 2.17.37 +- Python version updated to v3.12.5 +- Fixes apt-get for debian package installations + +## [7.3.0] +- Core dependencies updated to latest versions: + * awscli 1.32.112 -> 1.33.9 + * boto3 1.34.112 -> 1.34.127 + * civis 2.1.0 -> 2.3.0 + * numpy 1.26.4 -> 2.0.0 + * requests 2.32.2 -> 2.32.3 +- Python version updated to v3.12.4 + +## [7.2.0] +- Core dependencies updated to latest versions: + * awscli 1.32.109 -> 1.32.112 + * boto3 1.34.109 -> 1.34.112 + * civis 2.0.0 -> 2.1.0 + * scipy 1.13.0 -> 1.13.1 + +## [7.1.0] +- Python updated to v3.12.3 +- Core dependencies updated to latest versions: + * awscli 1.29.5 -> 1.32.109 + * boto3 1.28.5 -> 1.34.109 + * civis 1.16.1 -> 2.0.0 + * numpy 1.25.1 -> 1.26.4 + * pandas 2.0.3 -> 2.2.2 + * 
requests 2.31.0 -> 2.32.2 + * scikit-learn 1.3.0 -> 1.5.0 + * scipy 1.11.1 -> 1.13.0 + +## [7.0.0] +- Python updated to v3.11.4 +- Refactors Dockerfile to use the official Python docker image +- Removes Conda Dependency +- Keeps only the core data science python packages + +## [6.5.1] +### Package Updates +- Pin cffi at 1.14.0 (#85) + +## [6.5.0] +### Package Updates +- civis 1.15.1 -> 1.16.0 (#84) +- Removed pubnub - not considered a breaking change, as it was a dependency for `civis` (#84) + +## [6.4.0] +### Package Updates +- PyYAML 3.13 -> 5.2.0 + +## [6.3.1] +### Package Updates +- civis 1.15.0 -> 1.15.1 (#81) + +## [6.3.0] +### Package Updates +- civis 1.14.0 -> 1.15.0 (#79) +- tensorflow 1.15.2 -> 1.15.4 (#79) + +## [6.2.1] +### Package Updates +- civis 1.14.0 -> 1.14.1 (#78) + +## [6.2.0] +### Package Updates +- civis 1.13.0 -> 1.14.0 (#77) +- muffnn 2.3.0 -> 2.3.1 (#77) +- tensorflow 1.13.1 -> 1.15.2 (#77) + +## [6.1.0] +### Package Updates +- civis 1.12.1 -> 1.13.0 (#76) + +## [6.0.0] +### Changed +- Python 3.7.1 -> 3.7.6 (#74) +- Conda 4.6.8 -> 4.8.1 (#74) + +### Removed Packages +- removes python-simple-hipchat (#75) + +### New Packages +- explicitly installs pip=20.0.2 (#75) + +### Package Updates +- awscli 1.16.121 -> 1.17.15 (#75) +- beautifulsoup4 4.7.1 -> 4.8.2 (#75) +- botocore 1.12.111 -> 1.14.15 (#75) +- boto3 1.9.111 -> 1.11.15 (#75) +- bqplot 0.11.5 -> 0.12.3 (#75) +- civis 1.9.4 -> 1.12.1 (#74) +- civisml-extensions 0.1.10 -> 0.2.1 (#74) +- cloudpickle 0.8.0 -> 1.2.2 (#74) +- cython 0.29.6 -> 0.29.15 (#75) +- dask 1.1.4 -> 2.10.1 (#75) +- dropbox 9.3.0 -> 9.4.0 (#75) +- ipython 7.3.0 -> 7.12.0 (#75) +- ipywidgets 7.4.2 -> 7.5.1 (#75) +- jinja2 2.10 -> 2.11.1 (#75) +- joblib 0.11.0 -> 0.14.1 (#74) +- jsonschema 3.0.1 -> 3.2.0 (#75) +- libtiff 4.0.10 -> 4.1.0 (#75) +- libxml2 2.9.8 -> 2.9.10 (#75) +- matplotlib 3.0.3 -> 3.1.3 (#75) +- muffnn 2.2.0 -> 2.3.0 (#74) +- nomkl 1.0 -> 3.0 (#74) +- notebook 5.7.5 -> 6.0.3 (#75) +- numexpr 2.6.9 -> 
2.7.1 (#75) +- numpy 1.16.2 -> 1.17.3 (#74) +- openblas 0.3.5 -> 0.3.6 (#68, #70) +- pandas 0.24.1 -> 0.25.3 (#74) +- psycopg2 2.7.7 -> 2.8.4 (#75) +- pubnub 4.1.2 -> 4.3.0 (#75) +- pyarrow 0.12.1 -> 0.16.0 (#75) +- pytest 4.3.0 -> 5.3.5 (#75) +- requests 2.21.0 -> 2.22.0 (#75) +- s3fs 0.2.0 -> 0.4.0 (#75) +- scipy 1.2.0 -> 1.4.1 (#74) +- scikit-learn 0.19.2 -> 0.22.1 (#74) +- seaborn 0.9.0 -> 0.10.0 (#75) +- statsmodels 0.9.0 -> 0.11.0 (#75) +- urllib3 1.24.1 -> 1.25.7 (#75) + +### Added +- added buildspecs for autobuilding and pushing Docker image to Amazon ECR (#69) + +## [5.0.0] - 2019-03-12 +### Changed +- Ubuntu 14.04 -> 18.04 (#67) +- python 3.6.4 -> 3.7.1 +- conda 4.3.30 -> 4.6.8 + +### New Packages +- explicitly installs click=6.7 + +### Package Updates +- awscli 1.15.4 -> 1.16.121 +- beautifulsoup4 4.5.3 -> 4.7.1 +- botocore 1.10.4 -> 1.12.111 +- boto 2.46.1 -> 2.49.0 +- boto3 1.7.4 -> 1.9.111 +- bqplot 0.10.2 -> 0.11.5 +- cloudpickle 0.5.2 -> 0.8.0 +- cython 0.27.3 -> 0.29.6 +- dask 0.17.2 -> 1.1.4 +- ipython 6.1.0 -> 7.3.0 +- ipywidgets 7.1.0 -> 7.4.2 +- jinja2 2.9.6 -> 2.10 +- jsonschema 2.5.1 -> 3.0.1 +- libtiff 4.0.6 -> 4.0.10 +- libxml2 2.9.2 -> 2.9.8 +- matplotlib 2.2.2 -> 3.0.3 +- notebook 5.4.1 -> 5.7.5 +- numexpr 2.6.2 -> 2.6.9 +- numpy 1.13.3 -> 1.16.2 +- openblas 0.2.20 -> 0.3.5 +- pandas 0.22.0 -> 0.24.1 +- patsy 0.4.1 -> 0.5.1 +- psycopg2 2.6.2 -> 2.7.7 +- pyarrow 0.8.0 -> 0.12.1 +- pytest 3.1.3 -> 4.3.0 +- pyyaml 3.12 -> 3.13 +- requests 2.18.4 -> 2.21.0 +- s3fs 0.1.2 -> 0.2.0 +- seaborn 0.8 -> 0.9.0 +- scipy 1.0.1 -> 1.2.0 +- scikit-learn 0.19.1 -> 0.19.2 +- statsmodels 0.8.0 -> 0.9.0 +- urllib3 1.22 -> 1.24.1 +- xgboost 0.6a2 -> 0.81 +- civis 1.9.0 -> 1.9.4 +- civisml-extensions 0.1.8 -> 0.1.10 +- dropbox 7.1.1 -> 9.3.0 +- glmnet 2.0.0 -> 2.1.1 +- muffnn 2.1.0 -> 2.2.0 +- pubnub 4.0.13 -> 4.1.2 +- requests-toolbelt 0.8.0 -> 0.9.1 +- tensorflow 1.7.0 -> 1.13.1 + +### Maintenance +- Update CircleCI config to v2 (#62). 
+- Test that tensorflow imports successfully (#67). + +## [4.2.0] - 2018-04-26 +### Package Updates +- civis 1.8.1 -> 1.9.0 +- civisml-extensions 0.1.6 -> 0.1.8 +- muffnn 2.0.0 -> 2.1.0 + +- dask 0.15.4 (pip) -> 0.17.2 (conda) +- tensorflow 1.4.1 -> 1.7.0 +- ipython 6.1.0 -> 6.3.1 +- matplotlib 2.1.0 -> 2.2.2 +- notebook 5.2.2 -> 5.4.1 +- scipy 1.0.0 -> 1.0.1 +- urllib3 1.22 (pip) -> 1.22 (conda) + +## [4.1.0] - 2018-04-19 +### Added +- Added a link in the README directing users who may be reading documentation on DockerHub to instead go to GitHub (#56). + +### Package Updates +- awscli 1.11.75 (from pip) -> 1.15.4 (from conda) +- botocore 1.5.38 -> 1.10.4 +- boto3 1.5.11 -> 1.7.4 + +## [4.0.1] - 2018-02-01 +### Package Updates +- civis 1.8.0 -> 1.8.1 + +## [4.0.0] - 2018-01-23 +### New packages +- bqplot 0.10.2 +- feather-format 0.4.0 + +### Changed +- Updated Python version from 3.6.2 to 3.6.4. + +### Package Updates +- civis 1.7.1 -> 1.8.0 +- civisml-extensions 0.1.5 -> 0.1.6 +- muffnn 1.2.0 -> 2.0.0 +- cloudpickle 0.5.1 -> 0.5.2 +- dask 0.15.4 -> 0.16.1 +- ftputil 3.3.1 -> 3.4 +- tensorflow 1.4.0 -> 1.4.1 +- boto3 1.4.5 -> 1.5.11 +- cython 0.26 -> 0.27.3 +- openblas 0.2.19 -> 0.2.20 +- pandas 0.21.0 -> 0.22.0 +- pyarrow 0.7.1 -> 0.8.0 +- scipy 0.19.1 -> 1.0.0 +- ipywidgets 7.0.0 -> 7.1.0 +- notebook 5.2.0 -> 5.2.2 + +### Fixed +- Enabled widgetsnbextension so that ipywidgets works. 
+- Suppress irrelevant warning from tensorflow v1.4 + +## [3.3.0] - 2017-11-17 +### Package Updates +- civis 1.6.2 -> 1.7.1 + +### New packages +- civisml-extensions 0.1.5 +- dask 0.15.4 +- s3fs 0.1.2 + +### Changed +- Moved conda to version 4.3.30 + +### Package Updates +- boto3 1.4.4 -> 1.4.5 +- matplotlib 2.0.2 -> 2.1.0 +- numpy 1.13.1 -> 1.13.3 +- pandas 0.20.3 -> 0.21.0 +- pyarrow 0.5.0 -> 0.7.1 +- scikit-learn 0.19.0 -> 0.19.1 +- cloudpickle 0.3.1 -> 0.5.1 +- muffnn 1.1.2 -> 1.2.0 +- pubnub 4.0.12 -> 4.0.13 +- tensorflow 1.2.1 -> 1.4.0 + +## [3.2.0] - 2017-09-11 +### Package Updates +- scikit-learn 0.18.2 -> 0.19.0 +- civis 1.6.0 -> 1.6.2 +- requests 2.14.2 -> 2.18.4 +- urllib3 1.19 -> 1.22 + +## [3.1.0] - 2017-07-31 +### New packages +- cloudpickle 0.3.1 +- pyarrow 0.5.0 (from conda-forge) + +### Python +- Update from v3.6.1 to v3.6.2 + +### Package Updates +- civis 1.5.2 -> 1.6.0 +- cython 0.25.2 -> 0.26 +- ipython 6.0.0 -> 6.1.0 +- jinja2 2.8 -> 2.9.6 +- numpy 1.12.1 -> 1.13.1 +- pandas 0.20.1 -> 0.20.3 +- pytest 3.0.5 -> 3.1.3 +- seaborn 0.7.1 -> 0.8 +- scipy 0.19.0 -> 0.19.1 +- scikit-learn 0.18.1 -> 0.18.2 +- pubnub 4.0.10 -> 4.0.12 +- requests-toolbelt 0.7.1 -> 0.8.0 +- tensorflow 1.1.0 -> 1.2.1 + +### Changed +- Install xgboost from conda-forge instead of from PyPI + +### Fixes +- Use /tmp for joblib temporary files instead of /shm + + +## [3.0.1] - 2017-05-25 +### Package Updates +- muffnn 1.1.1 -> 1.1.2 + +## [3.0.0] - 2017-05-17 +### Package updates +- civis 1.4.0 -> 1.5.2 +- ipython 5.1.0 -> 6.0.0 +- matplotlib 2.0.0 -> 2.0.2 +- pandas 0.19.2 -> 0.20.1 +- requests 2.13.0 -> 2.14.2 + +## [2.2.0] - 2017-05-02 ### Removed - Remove pinned conda installs of `libgcc` and `libsodium`. This prevented use of the environment file in OS X, and they are dependencies automatically installed by conda in the Docker image build. +### Additions +- Explicitly added `botocore` v1.5.38. 
We had `botocore` installed before (it's a dependency of other AWS libraries), but we're now explicitly including the version number. + +### Package updates +- python 3.6.0 -> 3.6.1 +- awscli 1.11.60 -> 1.11.75 +- boto 2.45.0 -> 2.46.1 +- boto3 1.4.3 -> 1.4.4 +- numpy 1.12.0 -> 1.12.1 +- pubnub 4.0.8 -> 4.0.10 +- requests 2.12.4 -> 2.13.0 +- scipy 0.18.1 -> 0.19.0 +- muffnn 1.0.0 -> 1.1.1 +- tensorflow 1.0.0 -> 1.1.0 ## [2.1.0] - 2017-03-17 ### Changed diff --git a/Dockerfile b/Dockerfile index e892a38..0e9ab9b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,98 +1,70 @@ -FROM ubuntu:14.04 -MAINTAINER support@civisanalytics.com +ARG PLATFORM=linux/x86_64 +ARG BASE_IMAGE=python:3.13.5-slim -# Ensure UTF-8 locale. -RUN locale-gen en_US.UTF-8 +FROM --platform=$PLATFORM $BASE_IMAGE AS uv-installed -# Set environment variables for UTF-8, conda, and shell environments -ENV LANG=en_US.UTF-8 \ - LANGUAGE=en_US:en \ - LC_ALL=en_US.UTF-8 \ - CONDARC=/opt/conda/.condarc \ - BASH_ENV=/etc/profile \ - PATH=/opt/conda/bin:$PATH \ - CIVIS_CONDA_VERSION=4.3.11 \ - CIVIS_PYTHON_VERSION=3.6.0 +# Disable pip warnings https://stackoverflow.com/a/72551258 +ENV PIP_ROOT_USER_ACTION=ignore + +LABEL maintainer=support@civisanalytics.com RUN DEBIAN_FRONTEND=noninteractive apt-get update -y --no-install-recommends && \ + apt-get install -y --no-install-recommends locales && \ + locale-gen en_US.UTF-8 && \ apt-get install -y --no-install-recommends software-properties-common && \ apt-get install -y --no-install-recommends \ - make \ - automake \ - libpq-dev \ - libffi-dev \ - gfortran \ - g++ \ - git \ - libboost-program-options-dev \ - libtool \ - libxrender1 \ - wget \ - ca-certificates \ - curl && \ + make \ + automake \ + libpq-dev \ + libffi-dev \ + gfortran \ + g++ \ + git \ + libboost-program-options-dev \ + libtool \ + libxrender1 \ + wget \ + ca-certificates \ + curl \ + mandoc \ + unzip && \ apt-get clean -y && \ rm -rf /var/lib/apt/lists/* -# Conda install. 
-# -# Everything is installed in the root environment. This allows for -# upgrades to the packages and eliminates the pain of trying to activate -# some other environment automatically for the many different ways -# people can use a docker image. -# -# Things are pinned to prevent upgrades from conda and force it to -# resolve dependencies relative to a fixed conda & python version. -# -# Note that the python version is also listed in the enviornment.yml -# file. The version in CIVIS_PYTHON_VERSION is the source of truth. -# If you want to change the python version, you need to change it in -# **both** places. The python version has been left in the `environment.yml` -# file so that people can create environments equivalent to this -# container. -# -# The ordering of these steps seems to matter. You seem to have to -# install a specific python version by hand and then pin it. -# 1) install conda -# 2) pin conda to the version given by CIVIS_CONDA_VERSION -# 3) install the python version CIVIS_PYTHON_VERSION -# 4) pin the python version -# -# Extra symlinks are added at the end because... -# Red Hat and Debian use different names for this file. git2R wants the latter. 
-# See conda-recipes GH 423 -RUN echo 'export PATH=/opt/conda/bin:$PATH' > /etc/profile.d/conda.sh && \ - wget --quiet https://repo.continuum.io/miniconda/Miniconda3-${CIVIS_CONDA_VERSION}-Linux-x86_64.sh && \ - /bin/bash /Miniconda3-${CIVIS_CONDA_VERSION}-Linux-x86_64.sh -b -p /opt/conda && \ - rm Miniconda3-${CIVIS_CONDA_VERSION}-Linux-x86_64.sh && \ - /opt/conda/bin/conda install --yes conda==${CIVIS_CONDA_VERSION} && \ - echo "conda ==${CIVIS_CONDA_VERSION}" > /opt/conda/conda-meta/pinned && \ - conda install --yes python==${CIVIS_PYTHON_VERSION} && \ - echo "python ==${CIVIS_PYTHON_VERSION}" >> /opt/conda/conda-meta/pinned && \ - conda clean --all -y && \ - ln -s /opt/conda/lib/libopenblas.so /opt/conda/lib/libblas.so && \ - ln -s /opt/conda/lib/libopenblas.so /opt/conda/lib/liblapack.so && \ - ln -s /opt/conda/lib/libssl.so /opt/conda/lib/libssl.so.6 && \ - ln -s /opt/conda/lib/libcrypto.so /opt/conda/lib/libcrypto.so.6 +# Install uv. +ADD https://astral.sh/uv/0.7.19/install.sh /uv-installer.sh +RUN sh /uv-installer.sh && rm /uv-installer.sh +ENV PATH="/root/.local/bin/:$PATH" \ + UV_SYSTEM_PYTHON=1 + +# This is the primary build target used for the production image +FROM --platform=$PLATFORM uv-installed AS production + +COPY requirements-full.txt . + +RUN uv pip install --no-progress --no-cache -r requirements-full.txt && \ + rm requirements-full.txt + +# Instruct joblib to use disk for temporary files. Joblib defaults to +# /shm when that directory is present. In the Docker container, /shm is +# present but defaults to 64 MB. +# https://github.com/joblib/joblib/blob/0.11/joblib/parallel.py#L328L342 +ENV JOBLIB_TEMP_FOLDER=/tmp + +ENV VERSION=8.3.0 \ + VERSION_MAJOR=8 \ + VERSION_MINOR=3 \ + VERSION_MICRO=0 -# Install boto in the base environment for private s3 channel support. 
-# Install Python Packages -COPY .condarc /opt/conda/.condarc -COPY environment.yml environment.yml -RUN conda install -y boto && \ - conda install -y nomkl && \ - conda env update -f environment.yml -n root && \ - conda clean --all -y && \ - rm -rf ~/.cache/pip +# This build target is for testing in CircleCI. +FROM --platform=$PLATFORM production AS test +COPY .circleci/test_image.py . +COPY CHANGELOG.md . -# We aren't running a GUI, so force matplotlib to use -# the non-interactive "Agg" backend for graphics. -# Run matplotlib once to build the font cache. -ENV MATPLOTLIBRC=${HOME}/.config/matplotlib/matplotlibrc -RUN mkdir -p ${HOME}/.config/matplotlib && \ - echo "backend : Agg" > ${HOME}/.config/matplotlib/matplotlibrc && \ - python -c "import matplotlib.pyplot" +# This build target is for updating dependencies. +# See generate-requirements-full.sh. +FROM --platform=$PLATFORM uv-installed AS update-deps +CMD ["/bin/bash"] -ENV VERSION=2.1.0 \ - VERSION_MAJOR=2 \ - VERSION_MINOR=1 \ - VERSION_MICRO=0 +# Default to the production build target. +FROM production diff --git a/README.md b/README.md index a084091..ea97734 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![CircleCI](https://circleci.com/gh/civisanalytics/datascience-python/tree/master.svg?style=svg)](https://circleci.com/gh/civisanalytics/datascience-python/tree/master) -This image is created from the official Ubuntu 14.04 Docker image and contains popular Python packages for data science. +If you are reading this README on DockerHub, then the links to files in the GitHub repository will be broken. Please read this documentation from [GitHub](https://github.com/civisanalytics/datascience-python) instead. # Introduction @@ -33,7 +33,7 @@ to retrieve a reproducible environment. Inside the datascience-python Docker image, Python packages are installed in the `root` environment. For a full list of included Python libraries, see the -[environment.yml](environment.yml) file.
+[requirements-core.txt](requirements-core.txt) file. To start a Docker container from the datascience-python image and interact with it from a bash prompt, use @@ -54,21 +54,26 @@ VERSION_MAJOR VERSION_MINOR VERSION_MICRO ``` -VERSION contains the full version string, e.g. "1.0.3". VERSION_MAJOR, +VERSION contains the full version string, e.g., "1.0.3". VERSION_MAJOR, VERSION_MINOR, and VERSION_MICRO each contain a single integer. -# Creating Equivalent Local Environments +## Joblib Temporary Files + +The [`joblib`](https://pythonhosted.org/joblib/) library enhances multiprocessing +capabilities for scientific Python computing. In particular, the `scikit-learn` +library uses `joblib` for parallelization. This Docker image sets `joblib`'s +default location for staging temporary files to the /tmp directory. +The normal default is /dev/shm. /dev/shm is a RAM disk which defaults to a 64 MB size +in Docker containers, too small for typical scientific computing. -The `environment.yml` file in this repo can be used to create a python environment that is -equivalent to the one in the container. This environment will be named `datascience`. -The environment installs in Ubuntu Linux (this is the OS of the Dockerfile). -It will install in OS X, but the `xgboost` install requires either -the `gcc` v5 or the `clang-omp` compiler, neither of which are natively provided in OS X. -If you wish to set up this environment in OS X, you may either -- Remove `xgboost` from the `environment.yml` file before using it to create the environment -- Use [Homebrew](https://brew.sh/) to install `gcc-5`. You can do that via -`brew install gcc@5 --without-multilib`. Be warned that this installation will take -a long time. +# Updating Existing Package Versions +1. Update versions of existing packages in `requirements-core.txt` +2. Run script `generate-requirements-full.sh` + +# Creating Equivalent Local Environments +1. Create a new python environment `python -m venv .venv`. +2. 
Activate your new python environment `source .venv/bin/activate` +3. Install the pinned requirements: `pip install -r requirements-full.txt` # Contributing @@ -83,9 +88,9 @@ and describe any changes in the [change log](CHANGELOG.md). ## For Maintainers This repo has autobuild enabled. Any PR that is merged to master will -be built as the `latest` tag on Dockerhub. +be built as the `latest` tag on DockerHub. Once you are ready to create a new version, go to the "releases" tab of the repository and click -"Draft a new release". Github will prompt you to create a new tag, release title, and release +"Draft a new release". GitHub will prompt you to create a new tag, release title, and release description. The tag should use semantic versioning in the form "vX.X.X"; "major.minor.micro". The title of the release should be the same as the tag. Include a change log in the release description. Once the release is tagged, DockerHub will automatically build three identical containers, with labels diff --git a/buildspec/merge_master.yaml b/buildspec/merge_master.yaml new file mode 100644 index 0000000..27fcd9d --- /dev/null +++ b/buildspec/merge_master.yaml @@ -0,0 +1,14 @@ +version: 0.2 +phases: + pre_build: + commands: + - echo Logging in to Amazon ECR... + - aws ecr get-login-password --region ${AWS_DEFAULT_REGION} | docker login --username AWS --password-stdin ${FIPS_REPOSITORY_URI} + build: + commands: + - echo Building the Docker image... + - docker build -t ${FIPS_REPOSITORY_URI}:latest . + - docker image push --all-tags ${FIPS_REPOSITORY_URI} + post_build: + commands: + - echo Build completed! diff --git a/buildspec/push.yaml b/buildspec/push.yaml new file mode 100644 index 0000000..0f10bc4 --- /dev/null +++ b/buildspec/push.yaml @@ -0,0 +1,19 @@ +version: 0.2 +phases: + build: + commands: + - echo Logging in to Amazon ECR...
+ - aws ecr get-login-password --region ${AWS_DEFAULT_REGION} | docker login --username AWS --password-stdin ${FIPS_REPOSITORY_URI} + - export COMMIT_HASH_SHORT="$(echo $COMMIT_HASH | cut -c 1-7)" + - echo Building the Docker image... + - echo $FIPS_REPOSITORY_URI + - echo $COMMIT_HASH_SHORT + - echo $BRANCH_NAME + - docker build --tag ${FIPS_REPOSITORY_URI}:${COMMIT_HASH_SHORT} --tag ${FIPS_REPOSITORY_URI}:${BRANCH_NAME} . + # We have a life cycle policy in place to expire and delete images from dev branches, + # so there are no issues with pushing as many of these images as there may be. + - docker image push --all-tags ${FIPS_REPOSITORY_URI} + post_build: + commands: + - echo Build completed! + - printf '{"tag":"%s"}' $COMMIT_HASH_SHORT > build.json diff --git a/buildspec/release.yaml b/buildspec/release.yaml new file mode 100644 index 0000000..add6384 --- /dev/null +++ b/buildspec/release.yaml @@ -0,0 +1,17 @@ + +version: 0.2 +phases: + build: + commands: + - echo Logging in to Amazon ECR... + - aws ecr get-login-password --region ${AWS_DEFAULT_REGION} | docker login --username AWS --password-stdin ${FIPS_REPOSITORY_URI} + - echo Building the Docker image... + - PATCH_TAG=${TAG_NAME#"v"} # major.minor.patch + - MINOR_TAG=${PATCH_TAG%.*} # major.minor + - MAJOR_TAG=${MINOR_TAG%.*} # major + - docker build -t ${FIPS_REPOSITORY_URI}:${PATCH_TAG} -t ${FIPS_REPOSITORY_URI}:${MINOR_TAG} -t ${FIPS_REPOSITORY_URI}:${MAJOR_TAG} . + - docker image push --all-tags ${FIPS_REPOSITORY_URI} + post_build: + commands: + - echo Build completed! + - printf '{"tag":"%s"}' $TAG_NAME > build.json diff --git a/circle.yml b/circle.yml deleted file mode 100644 index 3ea7fa4..0000000 --- a/circle.yml +++ /dev/null @@ -1,17 +0,0 @@ -machine: - services: - - docker - -dependencies: - override: - - docker build -t civisanalytics/datascience-python . 
- -test: - override: - - docker run civisanalytics/datascience-python /bin/bash -c "echo BUILDS OK" - - docker run civisanalytics/datascience-python python -c "from scipy.linalg import _fblas" - - docker run civisanalytics/datascience-python python -c "import numpy, os; import matplotlib.pyplot as plt; x = numpy.arange(100); y = numpy.sin(x); plt.plot(x, y);" - - docker run civisanalytics/datascience-python python -c "import seaborn" - - docker run -t civisanalytics/datascience-python /bin/bash -c "python -c 'import numpy'" - - docker run civisanalytics/datascience-python python -c "from numpy.distutils import system_info; assert system_info.get_info('mkl') == {}" - - docker run civisanalytics/datascience-python python -c "import numpy; numpy.test()" diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..3742956 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,11 @@ +services: + update-deps: + build: + context: . + dockerfile: ./Dockerfile + target: update-deps + volumes: + - .:/app + stdin_open: true + tty: true + working_dir: /app diff --git a/environment.yml b/environment.yml deleted file mode 100644 index 0540b2a..0000000 --- a/environment.yml +++ /dev/null @@ -1,47 +0,0 @@ -name: datascience -dependencies: -- beautifulsoup4=4.5.3 -- boto=2.45.0 -- boto3==1.4.3 -- cython=0.25.2 -- ipython=5.1.0 -- jinja2=2.8 -- jsonschema=2.5.1 -- jupyter=1.0.0 -- libffi=3.2.1 -- libgfortran=3.0.0 -- libtiff=4.0.6 -- libxml2=2.9.2 -- matplotlib=2.0.0 -- nomkl=1.0 -- nose=1.3.7 -- numexpr=2.6.2 -- numpy=1.12.0 -- openblas=0.2.19 -- pandas=0.19.2 -- patsy=0.4.1 -- psycopg2=2.6.2 -- pycrypto=2.6.1 -- pytest=3.0.5 -- python=3.6.0 -- pyyaml=3.12 -- requests=2.12.4 -- seaborn=0.7.1 -- scipy=0.18.1 -- scikit-learn=0.18.1 -- statsmodels=0.8.0 -- pip: - - awscli==1.11.60 - - civis==1.4.0 - - dropbox==7.1.1 - - ftputil==3.3.1 - - glmnet==2.0.0 - - joblib==0.11.0 - - muffnn==1.0.0 - - pubnub==4.0.8 - - pysftp==0.2.9 - - python-simple-hipchat==0.4.0 - - 
requests-toolbelt==0.7.1 - - tensorflow==1.0.0 - - urllib3==1.19 - - xgboost==0.6a2 diff --git a/generate-requirements-full.sh b/generate-requirements-full.sh new file mode 100755 index 0000000..40c7889 --- /dev/null +++ b/generate-requirements-full.sh @@ -0,0 +1,5 @@ +#!/bin/bash +# Run this script to regenerate requirements-full.txt from requirements-core.txt. +# It uses Docker to ensure that the environment matches what will be used in the production image. +set -e +docker compose run --rm update-deps /bin/sh -c "uv pip compile --output-file=requirements-full.txt --upgrade requirements-core.txt" diff --git a/requirements-core.txt b/requirements-core.txt new file mode 100644 index 0000000..fb90216 --- /dev/null +++ b/requirements-core.txt @@ -0,0 +1,12 @@ +# awscli v2 is not officially available on PyPI (https://github.com/aws/aws-cli/issues/4947). +# Specifying awscli in requirements-core.txt here ensures that it (and its transitive dependencies) +# are taken into account when generating the final requirements-full.txt file.
+awscli @ git+https://github.com/aws/aws-cli@2.27.48 +boto3==1.39.2 +civis==2.7.1 +numpy==2.3.1 +pandas==2.3.0 +polars==1.31.0 +requests==2.32.4 +scikit-learn==1.7.0 +scipy==1.16.0 diff --git a/requirements-full.txt b/requirements-full.txt new file mode 100644 index 0000000..1fd81f5 --- /dev/null +++ b/requirements-full.txt @@ -0,0 +1,109 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile --output-file=requirements-full.txt requirements-core.txt +attrs==25.3.0 + # via + # jsonschema + # referencing +awscli @ git+https://github.com/aws/aws-cli@6804a17061546394f88e1fe6d1bf9b24cd8a09ec + # via -r requirements-core.txt +awscrt==0.26.1 + # via awscli +boto3==1.39.2 + # via -r requirements-core.txt +botocore==1.39.2 + # via + # boto3 + # s3transfer +certifi==2025.6.15 + # via requests +charset-normalizer==3.4.2 + # via requests +civis==2.7.1 + # via -r requirements-core.txt +click==8.2.1 + # via civis +cloudpickle==3.1.1 + # via civis +colorama==0.4.6 + # via awscli +distro==1.8.0 + # via awscli +docutils==0.19 + # via awscli +idna==3.10 + # via requests +jmespath==1.0.1 + # via + # awscli + # boto3 + # botocore +joblib==1.5.1 + # via + # civis + # scikit-learn +jsonref==1.1.0 + # via civis +jsonschema==4.24.0 + # via civis +jsonschema-specifications==2025.4.1 + # via jsonschema +numpy==2.3.1 + # via + # -r requirements-core.txt + # pandas + # scikit-learn + # scipy +pandas==2.3.0 + # via -r requirements-core.txt +polars==1.31.0 + # via -r requirements-core.txt +prompt-toolkit==3.0.38 + # via awscli +python-dateutil==2.9.0 + # via + # awscli + # botocore + # pandas +pytz==2025.2 + # via pandas +pyyaml==6.0.2 + # via civis +referencing==0.36.2 + # via + # jsonschema + # jsonschema-specifications +requests==2.32.4 + # via + # -r requirements-core.txt + # civis +rpds-py==0.26.0 + # via + # jsonschema + # referencing +ruamel-yaml==0.17.21 + # via awscli +ruamel-yaml-clib==0.2.12 + # via awscli +s3transfer==0.13.0 + # via boto3 
+scikit-learn==1.7.0 + # via -r requirements-core.txt +scipy==1.16.0 + # via + # -r requirements-core.txt + # scikit-learn +six==1.17.0 + # via python-dateutil +tenacity==9.1.2 + # via civis +threadpoolctl==3.6.0 + # via scikit-learn +tzdata==2025.2 + # via pandas +urllib3==1.26.20 + # via + # awscli + # botocore + # requests +wcwidth==0.2.13 + # via prompt-toolkit