Coverage for rust2rpm/inspect.py: 95%
99 statements
« prev ^ index » next coverage.py v7.6.7, created at 2024-11-26 13:52 +0100
« prev ^ index » next coverage.py v7.6.7, created at 2024-11-26 13:52 +0100
1"""Module containing functionality for inspecting contents of crate files / tarballs."""
3import contextlib
4import os
5import re
6import tarfile
7import tempfile
8from collections.abc import Generator
9from pathlib import Path
10from typing import Any
12from rust2rpm import log
13from rust2rpm.conf import FileInEx, TomlConf
# Compiled with re.VERBOSE, so the whitespace and line breaks inside the
# pattern are ignored and each line is simply a group of alternatives.
# There is no IGNORECASE flag: apart from the explicit `[Ll]icen[cs]e` and
# lower-case `agpl` / `gpl` / `lgpl` alternatives, names must be upper-case.
# The negative lookahead `(?![.]rs$)` keeps names such as `license.rs` /
# `licence.rs` from being picked up by the `[Ll]icen[cs]e` alternative.
LICENSE_FILE_PATTERN = re.compile(
    r"""
    COPYING(?:[.-].*)?|COPYRIGHT(?:[.-].*)?|
    EULA(?:[.-].*)?|[Ll]icen[cs]e(?![.]rs$).*|
    (?:.*[.-])?(?:UN)?LICEN[CS]E(?:[.-].*)?|NOTICE(?:[.-].*)?|
    PATENTS(?:[.-].*)?|
    (?:agpl|l?gpl)[.-].*|CC-BY-.*|
    (?:AGPL|APACHE|BSD|GFDL|GNU|L?GPL|MIT|MPL|OFL)-.*[0-9].*
    """,
    re.VERBOSE,
)
"""Regular expression for heuristically determining files that contain license
texts based on the file name."""
# Set of bare directory names (not paths). `get_license_files` compares each
# directory name yielded by `os.walk` against this set, so a directory with
# one of these names is pruned at whatever depth it appears.
LICENSE_EXCLUDE_DIRS = {
    "target",
    "vendor",
    "example",
    "examples",
    "_example",
    "_examples",
    "testdata",
    "_testdata",
    ".github",
    "tests",
    "test",
}
"""List of directories that are ignored when crawling project sources for
license files."""
# Compiled with re.VERBOSE (whitespace inside the pattern is ignored) and
# re.IGNORECASE (so e.g. `readme.MD` and `Changelog` are accepted as well).
# Matches either a well-known documentation file extension or one of a few
# well-known base names, optionally followed by a `.` / `-` suffix.
DOC_FILE_PATTERN = re.compile(
    r"""
    .*\.(?:md|markdown|mdown|mkdn|rst|txt)|AUTHORS|
    AUTHORS[.-].*|CONTRIBUTORS|CONTRIBUTORS[.-].*|README|
    README[.-].*|CHANGELOG|CHANGELOG[.-].*|TODO|TODO[.-].*
    """,
    re.IGNORECASE | re.VERBOSE,
)
"""Regular expression for heuristically determining documentation files based
on the file name."""
# No flags are passed, so unlike DOC_FILE_PATTERN this pattern is
# case-sensitive. `get_doc_files` applies it with `fullmatch`, i.e. against
# the complete file name.
DOC_FILE_EXCLUDES = re.compile(r"CMakeLists\.txt|versions\.txt|.*\.tpl|.*\.in")
"""Regular expression used to ignore files that look like documentation but
actually are not when crawling project sources."""
class UnsafeTarballError(Exception):
    """Signals that a crate tarball holds contents that are unsafe to unpack."""
@contextlib.contextmanager
def files_from_crate(
    crate_path: Path,
    crate_name: str,
    crate_version: str,
    tomlconf: TomlConf,
) -> Generator[tuple[Path, list[str], list[str]], Any, None]:
    """Unpack the crate at the given path and inspect its contents.

    The archive is extracted into a temporary directory that is removed again
    when the context manager exits, so the yielded `Cargo.toml` path is only
    usable inside the `with` block.

    Arguments:
        crate_path: Path to the crate file / tarball.
        crate_name: Name of the crate.
        crate_version: Version of the crate.
        tomlconf: Global rust2rpm configuration, which contains settings for
            filtering / overriding heuristics for license and documentation
            file detection.

    Yields:
        Three-tuple of (path to the `Cargo.toml` file inside unpacked sources,
        list of detected license files, and list of detected documentation files).

    Raises:
        UnsafeTarballError: If a member name in the archive resolves to a
            location outside of the extraction directory.
        FileNotFoundError: If the unpacked sources do not contain
            `<crate_name>-<crate_version>/Cargo.toml`.

    """
    with tempfile.TemporaryDirectory() as tmpdir:
        # The temporary directory may itself be reached through a symlink.
        # Member paths are compared after `resolve()`, so the root has to be
        # resolved as well - otherwise every archive would look unsafe on
        # such systems.
        resolved_root = Path(tmpdir).resolve()

        with tarfile.open(crate_path, "r") as archive:
            for name in archive.getnames():
                member_path = (Path(tmpdir) / name).resolve()
                # Component-wise containment check: accepts the root itself
                # (an entry such as ".") and is not fooled by sibling
                # directories that merely share a string prefix.
                if not member_path.is_relative_to(resolved_root):
                    msg = "Crate tar archive contains unsafe filenames"
                    raise UnsafeTarballError(msg)

            # S202: the loop above checks that all filenames are safe
            # NOTE(review): only member *names* are validated here, link
            # targets inside the archive are not inspected.
            archive.extractall(tmpdir)  # noqa: S202

        root_path = Path(f"{tmpdir}/{crate_name}-{crate_version}")
        toml_path = root_path / "Cargo.toml"
        if not toml_path.is_file():
            msg = "Crate does not contain a Cargo.toml file."
            raise FileNotFoundError(msg)

        license_files = get_license_files(root_path, tomlconf)
        doc_files = get_doc_files(root_path, tomlconf)

        yield toml_path, license_files, doc_files
def filter_files_list(kind: str, path: Path, overrides: list[str]) -> list[str]:
    """Build the file list purely from the given file names and / or globs.

    Arguments:
        kind: The kind of file currently being processed (`"license"` or `"doc"`).
        path: Root directory of the project sources.
        overrides: List of file names or globs. An entry containing `*` or `?`
            is treated as a glob and expanded inside the project root
            directory; a warning is logged if it matches nothing. Every other
            entry is taken over verbatim, without checking that it exists.

    Returns:
        Sorted, de-duplicated list of file names relative to the project root
        directory.

    """
    selected: set[str] = set()

    for item in overrides:
        is_glob = "*" in item or "?" in item
        if not is_glob:
            selected.add(item)
            continue

        expanded = [str(hit.relative_to(path)) for hit in sorted(path.glob(item))]
        if not expanded:
            log.warn(f"No matches for specified %{kind} file glob: {item!r}")
        selected.update(expanded)

    return sorted(selected)
def filter_files_in_ex(kind: str, path: Path, files: set[str], overrides: FileInEx) -> list[str]:
    """Filter detected files given specific include and exclude settings.

    **Includes** are collected first - plain file names are added to the list
    of files as-is, and a warning is logged if an explicitly included file was
    already in the set of heuristically detected files. Globs are expanded
    inside the project root directory and any matches are added to the list.
    If no files match a specific glob, a warning is logged.

    **Excludes** are collected second - plain file names are removed from the
    list of files, and a warning is logged (and the entry skipped) if an
    explicitly excluded file is not in the set of heuristically detected
    files. Globs are expanded inside the project root directory and any
    matches are removed from the list. If no files match a specific glob, a
    warning is logged.

    Includes are applied before excludes, so an exclude glob can remove a file
    that was explicitly added with an include setting. Plain exclude names are
    only checked against the heuristically detected files, not against the
    includes.

    Removing a file that is not (or no longer) part of the result is a no-op.
    This matters for exclude globs, which may match files on disk that were
    never detected, and for files that are excluded more than once.

    Arguments:
        kind: The kind of file currently being processed (`"license"` or `"doc"`).
        path: Root directory of the project sources.
        files: Heuristically determined set of files. The set is not modified.
        overrides: Include and exclude settings, each an optional list of file
            names or globs.

    Returns:
        Sorted list of file names relative to the project root directory.

    """
    results = files.copy()

    include_list: list[str] = []
    exclude_list: list[str] = []

    for include in overrides.include or []:
        if "*" in include or "?" in include:
            matches = sorted(path.glob(include))
            if not matches:
                log.warn(f"No matches for specified %{kind} file glob: {include!r}")
            include_list.extend(str(match.relative_to(path)) for match in matches)
        else:
            if include in results:
                log.warn(f"Manually included %{kind} file already detected: {include!r}")
                continue
            include_list.append(include)

    for exclude in overrides.exclude or []:
        if "*" in exclude or "?" in exclude:
            matches = sorted(path.glob(exclude))
            if not matches:
                log.warn(f"No matches for specified %{kind} file glob: {exclude!r}")
            exclude_list.extend(str(match.relative_to(path)) for match in matches)
        else:
            if exclude not in results:
                log.warn(f"Manually excluded %{kind} file not detected: {exclude!r}")
                continue
            exclude_list.append(exclude)

    results.update(include_list)
    # Discard semantics on purpose: `set.remove` would raise KeyError for glob
    # matches that were never detected and for duplicated exclude entries.
    results.difference_update(exclude_list)

    return sorted(results)
207def filter_files(kind: str, path: Path, files: set[str], overrides: list[str] | FileInEx) -> list[str]:
208 """Filter detected files given specific overrides or include and exclude settings.
210 This function just calls `filter_files_list` or `filter_files_in_ex`
211 depending on the type of the `overrides` argument.
213 Arguments:
214 kind: The kind of file currently being processed (`"license"` or `"doc"`).
215 path: Root directory of the project sources.
216 files: Heuristically determined set of files.
217 overrides: List of file names or globs for both include and exclude
218 settings, or include / exclude settings.
220 Returns:
221 Sorted list of file names relative to the project root directory.
223 """
224 if isinstance(overrides, list):
225 return filter_files_list(kind, path, overrides)
227 if isinstance(overrides, FileInEx):
228 return filter_files_in_ex(kind, path, files, overrides)
230 raise ValueError # pragma nocover
def get_license_files(path: Path, tomlconf: TomlConf) -> list[str]:
    """Collect license files below the given path and apply the configured filters.

    The whole directory tree is walked, except for directories whose name is
    listed in `LICENSE_EXCLUDE_DIRS`. A file counts as a license file if its
    name matches `LICENSE_FILE_PATTERN` from the start of the name.

    Arguments:
        path: Root directory of the project sources.
        tomlconf: Global rust2rpm configuration; its `package.license_files`
            setting is used to override / filter the detected files.

    Returns:
        Sorted list of file names relative to the project root directory.

    """
    detected: set[str] = set()

    for current, subdirs, names in os.walk(path, topdown=True):
        # Prune in place: with topdown=True, os.walk only descends into the
        # directories that are still present in this list.
        subdirs[:] = [name for name in subdirs if name not in LICENSE_EXCLUDE_DIRS]

        here = Path(current)
        detected.update(
            str((here / name).relative_to(path)) for name in names if LICENSE_FILE_PATTERN.match(name)
        )

    return filter_files("license", path, detected, tomlconf.package.license_files)
def get_doc_files(path: Path, tomlconf: TomlConf) -> list[str]:
    """Collect documentation files in the given path and apply the configured filters.

    Only files directly inside `path` are considered - sub-directories are
    never entered. A file counts as documentation if its full name matches
    `DOC_FILE_PATTERN` and matches neither `LICENSE_FILE_PATTERN` nor
    `DOC_FILE_EXCLUDES`.

    Arguments:
        path: Root directory of the project sources.
        tomlconf: Global rust2rpm configuration; its `package.doc_files`
            setting is used to override / filter the detected files.

    Returns:
        Sorted list of file names relative to the project root directory.

    """
    detected: set[str] = set()

    for current, subdirs, names in os.walk(path, topdown=True):
        # Emptying the list in place stops os.walk from descending any further.
        subdirs[:] = []

        for name in names:
            if DOC_FILE_PATTERN.fullmatch(name) is None:
                continue
            if LICENSE_FILE_PATTERN.fullmatch(name) is not None:
                continue
            if DOC_FILE_EXCLUDES.fullmatch(name) is not None:
                continue

            relpath = str((Path(current) / name).relative_to(path))
            if relpath.startswith("target/"):
                continue
            detected.add(relpath)

    return filter_files("doc", path, detected, tomlconf.package.doc_files)