diff --git a/vulnerabilities/importer_yielder.py b/vulnerabilities/importer_yielder.py index b20db0453..192d173dd 100644 --- a/vulnerabilities/importer_yielder.py +++ b/vulnerabilities/importer_yielder.py @@ -172,7 +172,6 @@ 'db_url': 'https://usn.ubuntu.com/usn-db/database-all.json.bz2' }, }, - { 'name': 'github', 'license': '', @@ -182,7 +181,16 @@ 'endpoint': 'https://api.github.com/graphql', 'ecosystems': ['MAVEN', 'NUGET', 'COMPOSER'] } - } + }, + { + 'name': 'msr2019', + 'license': '', + 'last_run': None, + 'data_source': 'ProjectKBMSRDataSource', + 'data_source_cfg': { + 'etag': {} + }, + }, ] diff --git a/vulnerabilities/importers/__init__.py b/vulnerabilities/importers/__init__.py index ef327dceb..c41a0b279 100644 --- a/vulnerabilities/importers/__init__.py +++ b/vulnerabilities/importers/__init__.py @@ -38,3 +38,4 @@ from vulnerabilities.importers.ubuntu_usn import UbuntuUSNDataSource from vulnerabilities.importers.github import GitHubAPIDataSource from vulnerabilities.importers.nvd import NVDDataSource +from vulnerabilities.importers.project_kb_msr2019 import ProjectKBMSRDataSource diff --git a/vulnerabilities/importers/project_kb_msr2019.py b/vulnerabilities/importers/project_kb_msr2019.py new file mode 100644 index 000000000..0cd0b5d78 --- /dev/null +++ b/vulnerabilities/importers/project_kb_msr2019.py @@ -0,0 +1,96 @@ +# Copyright (c) nexB Inc. and others. All rights reserved. +# http://nexb.com and https://github.com/nexB/vulnerablecode/ +# The VulnerableCode software is licensed under the Apache License version 2.0. +# Data generated with VulnerableCode require an acknowledgment. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the
# specific language governing permissions and limitations under the License.
#
# When you publish or redistribute any data created with VulnerableCode or any VulnerableCode
# derivative work, you must accompany this data with the following acknowledgment:
#
#  Generated with VulnerableCode and provided on an "AS IS" BASIS, WITHOUT WARRANTIES
#  OR CONDITIONS OF ANY KIND, either express or implied. No content created from
#  VulnerableCode should be considered or used as legal advice. Consult an Attorney
#  for any legal advice.
#  VulnerableCode is a free software code scanning tool from nexB Inc. and others.
#  Visit https://github.com/nexB/vulnerablecode/ for support and download.

import csv
import dataclasses
import urllib.request

# Reading a CSV file from a url using `requests` is a bit too complicated.
# Use `urllib.request` for that purpose. Need `requests` because making
# a HEAD request using `urllib.request` is too complicated.
import requests
from packageurl import PackageURL


from vulnerabilities.data_source import Advisory
from vulnerabilities.data_source import DataSource
from vulnerabilities.data_source import Reference
from vulnerabilities.data_source import DataSourceConfiguration


@dataclasses.dataclass
class ProjectKBDataSourceConfiguration(DataSourceConfiguration):
    # Mapping of url -> last seen ETag header value for that url.
    etag: dict


class ProjectKBMSRDataSource(DataSource):
    """Data source that turns the Project KB MSR2019 CSV dataset into advisories."""

    CONFIG_CLASS = ProjectKBDataSourceConfiguration

    url = "https://raw.githubusercontent.com/SAP/project-kb/master/MSR2019/dataset/vulas_db_msr2019_release.csv"  # nopep8

    def updated_advisories(self):
        """
        Return batched advisories built from the dataset, or an empty list
        when the stored ETag shows the remote file has not changed.
        """
        # etag are like hashes of web responses. We maintain
        # (url, etag) mappings in the DB. `create_etag` creates
        # (url, etag) pair. If a (url, etag) already exists then the code
        # skips processing the response further to avoid duplicate work
        if self.create_etag(self.url):
            raw_data = self.fetch()
            advisories = self.to_advisories(raw_data)
            return self.batch_advisories(advisories)

        return []

    def create_etag(self, url):
        """
        Record the current ETag of ``url`` in ``self.config.etag``.

        Return True when the resource should be (re)processed: either the
        response carries no ETag header, or the ETag differs from the one
        stored for ``url``. Return False when the stored ETag is unchanged.
        """
        etag = requests.head(url).headers.get("ETag")
        if not etag:
            # Without an ETag there is nothing to compare against, so the
            # caller has to process the response every time.
            return True

        if self.config.etag.get(url) == etag:
            return False

        self.config.etag[url] = etag
        return True

    def fetch(self):
        """Download the dataset at ``self.url`` and return a ``csv.reader`` over its lines."""
        # Read and decode everything while the connection is open; the
        # context manager then closes the response, and the returned reader
        # only iterates over the in-memory list.
        with urllib.request.urlopen(self.url) as response:
            lines = [line.decode("utf-8") for line in response.readlines()]
        return csv.reader(lines)

    @staticmethod
    def to_advisories(csv_reader):
        """
        Build a list of ``Advisory`` objects from the rows of ``csv_reader``.

        Each non-empty row is unpacked into four columns: the vulnerability
        id, the project home url, the fix commit, and a fourth column that is
        ignored. A row with a different number of columns raises ValueError.
        """
        # Project KB MSR csv file has no header row
        advisories = []
        for row in csv_reader:
            if not row:
                # `csv.reader` yields an empty list for blank lines.
                continue

            vuln_id, proj_home, fix_commit, _ = row
            commit_link = proj_home + "/commit/" + fix_commit
            advisories.append(
                Advisory(
                    summary="",
                    impacted_package_urls=[],
                    vuln_references=[Reference(url=commit_link)],
                    cve_id=vuln_id,
                )
            )

        return advisories