Coverage for rust2rpm/inspect.py: 95%

99 statements  

« prev     ^ index     » next       coverage.py v7.6.7, created at 2024-11-26 13:52 +0100

1"""Module containing functionality for inspecting contents of crate files / tarballs.""" 

2 

3import contextlib 

4import os 

5import re 

6import tarfile 

7import tempfile 

8from collections.abc import Generator 

9from pathlib import Path 

10from typing import Any 

11 

12from rust2rpm import log 

13from rust2rpm.conf import FileInEx, TomlConf 

14 

# Heuristic for license file names. The pattern is compiled with re.VERBOSE,
# so the whitespace and line breaks inside the literal are not part of the
# expression. The alternatives cover, in order:
#   - COPYING / COPYRIGHT / EULA, optionally followed by "." or "-" and a suffix
#   - names starting with "License" / "license" / "Licence" / "licence", unless
#     that word is immediately followed by ".rs" at the end of the name
#   - LICENSE / LICENCE / UNLICENSE / UNLICENCE with an optional "<prefix>." or
#     "<prefix>-" in front and an optional "." / "-" suffix
#   - NOTICE and PATENTS, optionally followed by "." or "-" and a suffix
#   - lower-case "agpl" / "gpl" / "lgpl" followed by "." or "-", and "CC-BY-"
#   - upper-case license identifiers (AGPL, APACHE, BSD, ...) followed by "-"
#     and at least one digit somewhere after it
# The expression is not anchored at its end; whether the whole name or only a
# prefix has to match depends on the method the caller uses (match / fullmatch).
LICENSE_FILE_PATTERN = re.compile(
    r"""
    COPYING(?:[.-].*)?|COPYRIGHT(?:[.-].*)?|
    EULA(?:[.-].*)?|[Ll]icen[cs]e(?![.]rs$).*|
    (?:.*[.-])?(?:UN)?LICEN[CS]E(?:[.-].*)?|NOTICE(?:[.-].*)?|
    PATENTS(?:[.-].*)?|
    (?:agpl|l?gpl)[.-].*|CC-BY-.*|
    (?:AGPL|APACHE|BSD|GFDL|GNU|L?GPL|MIT|MPL|OFL)-.*[0-9].*
    """,
    re.VERBOSE,
)
"""Regular expression for heuristically determining files that contain license
texts based on the file name."""

28 

# Set of bare directory names (not paths). The license crawl compares each
# directory name it encounters against this set, so a directory with one of
# these names is skipped at any depth below the project root.
LICENSE_EXCLUDE_DIRS = {
    "target",
    "vendor",
    "example",
    "examples",
    "_example",
    "_examples",
    "testdata",
    "_testdata",
    ".github",
    "tests",
    "test",
}
"""List of directories that are ignored when crawling project sources for
license files."""

44 

# Heuristic for documentation file names, compiled case-insensitively and in
# verbose mode (whitespace inside the literal is ignored). When applied with
# fullmatch it accepts:
#   - any name ending in .md, .markdown, .mdown, .mkdn, .rst or .txt
#   - AUTHORS, CONTRIBUTORS, README, CHANGELOG and TODO, either bare or
#     followed by "." or "-" and an arbitrary suffix
DOC_FILE_PATTERN = re.compile(
    r"""
    .*\.(?:md|markdown|mdown|mkdn|rst|txt)|AUTHORS|
    AUTHORS[.-].*|CONTRIBUTORS|CONTRIBUTORS[.-].*|README|
    README[.-].*|CHANGELOG|CHANGELOG[.-].*|TODO|TODO[.-].*
    """,
    re.IGNORECASE | re.VERBOSE,
)
"""Regular expression for heuristically determining documentation files based
on the file name."""

55 

# Names that DOC_FILE_PATTERN accepts but that are build inputs rather than
# documentation: two specific ".txt" files plus anything ending in ".tpl" or
# ".in" (e.g. "README.md.in"). Unlike DOC_FILE_PATTERN this expression is
# compiled without re.IGNORECASE, so it is case-sensitive.
DOC_FILE_EXCLUDES = re.compile(r"CMakeLists\.txt|versions\.txt|.*\.tpl|.*\.in")
"""Regular expression used to ignore files that look like documentation but
actually are not when crawling project sources."""

59 

60 

class UnsafeTarballError(Exception):
    """Error signalling that a crate tarball has contents which are unsafe to unpack."""

63 

64 

@contextlib.contextmanager
def files_from_crate(
    crate_path: Path,
    crate_name: str,
    crate_version: str,
    tomlconf: TomlConf,
) -> Generator[tuple[Path, list[str], list[str]], Any, None]:
    """Unpack the crate at the given path and inspect its contents.

    The crate is extracted into a temporary directory that is removed again
    when the context manager exits, so the yielded `Cargo.toml` path is only
    valid inside the `with` block.

    Arguments:
        crate_path: Path to the crate file / tarball.
        crate_name: Name of the crate.
        crate_version: Version of the crate.
        tomlconf: Global rust2rpm configuration, which contains settings for
            filtering / overriding heuristics for license and documentation
            file detection.

    Yields:
        Three-tuple of (path to the `Cargo.toml` file inside unpacked sources,
        list of detected license files, and list of detected documentation files).

    Raises:
        UnsafeTarballError: If a member of the archive would be written
            outside of the temporary extraction directory.
        FileNotFoundError: If the unpacked sources do not contain
            `<crate_name>-<crate_version>/Cargo.toml`.

    """
    with tempfile.TemporaryDirectory() as tmpdir:
        # Resolve the extraction root once. The temporary directory itself may
        # live behind a symlink; comparing resolved member paths against the
        # unresolved directory name would then reject every member.
        extract_root = Path(tmpdir).resolve()
        unsafe_msg = "Crate tar archive contains unsafe filenames"

        with tarfile.open(crate_path, "r") as archive:
            for name in archive.getnames():
                # Path-aware containment check: catches absolute member names
                # and names that climb out of the root via "..".
                if not (extract_root / name).resolve().is_relative_to(extract_root):
                    raise UnsafeTarballError(unsafe_msg)

            if hasattr(tarfile, "data_filter"):
                # The name check above cannot see symlink / hardlink members
                # that redirect later members outside of the root. The "data"
                # extraction filter validates every member (including link
                # targets) at extraction time.
                try:
                    archive.extractall(tmpdir, filter="data")
                except tarfile.FilterError as exc:
                    raise UnsafeTarballError(unsafe_msg) from exc
            else:
                # Interpreter without extraction filters: only the name check
                # above protects the extraction.
                archive.extractall(tmpdir)  # noqa: S202

        root_path = Path(f"{tmpdir}/{crate_name}-{crate_version}")
        toml_path = root_path / "Cargo.toml"
        if not toml_path.is_file():
            msg = "Crate does not contain a Cargo.toml file."
            raise FileNotFoundError(msg)

        license_files = get_license_files(root_path, tomlconf)
        doc_files = get_doc_files(root_path, tomlconf)

        yield toml_path, license_files, doc_files

110 

111 

def filter_files_list(kind: str, path: Path, overrides: list[str]) -> list[str]:
    """Build the final file list purely from explicitly configured names and globs.

    Arguments:
        kind: The kind of file currently being processed (`"license"` or `"doc"`).
        path: Root directory of the project sources.
        overrides: List of file names or globs. Entries containing `*` or `?`
            are treated as globs and expanded relative to the project root;
            all other entries are taken over unchanged. A warning is logged
            for every glob that does not match anything.

    Returns:
        Sorted, de-duplicated list of file names relative to the project root
        directory.

    """
    selected: set[str] = set()

    for entry in overrides:
        is_glob = any(wildcard in entry for wildcard in ("*", "?"))
        if not is_glob:
            # Plain names are trusted as-is; their existence is not checked.
            selected.add(entry)
            continue

        expanded = [str(hit.relative_to(path)) for hit in path.glob(entry)]
        if not expanded:
            log.warn(f"No matches for specified %{kind} file glob: {entry!r}")
        selected.update(expanded)

    return sorted(selected)

138 

139 

def filter_files_in_ex(kind: str, path: Path, files: set[str], overrides: FileInEx) -> list[str]:
    """Filter detected files given specific include and exclude settings.

    **Includes** are processed first - plain file names are added to the list
    of files as-is, and a warning is logged if an explicitly included file was
    already in the set of heuristically detected files. Globs are expanded
    inside the project root directory and any matches are added to the list.
    If no files match a specific glob, a warning is logged.

    **Excludes** are processed second - plain file names are removed from the
    list of files as-is, and a warning is logged if an explicitly excluded file
    was already not in the set of heuristically detected files. Globs are
    expanded inside the project root directory and any matches are removed from
    the list. If no files match a specific glob, a warning is logged. Glob
    matches that are not part of the list (for example files that were never
    detected, or files matched by more than one exclude) are ignored.

    Plain include / exclude names are checked against the heuristically
    detected files only. It is possible for exclude settings to remove files
    that were explicitly added with include settings when using conflicting
    settings (for example, when an exclude glob matches an included file).

    Arguments:
        kind: The kind of file currently being processed (`"license"` or `"doc"`).
        path: Root directory of the project sources.
        files: Heuristically determined set of files. The set is not modified.
        overrides: List of file names or globs for both include and exclude
            settings.

    Returns:
        Sorted list of file names relative to the project root directory.

    """
    results = files.copy()

    include_list: list[str] = []
    exclude_list: list[str] = []

    for include in overrides.include or []:
        if "*" in include or "?" in include:
            matches = sorted(path.glob(include))
            if not matches:
                log.warn(f"No matches for specified %{kind} file glob: {include!r}")
            include_list.extend(str(match.relative_to(path)) for match in matches)
        else:
            if include in results:
                log.warn(f"Manually included %{kind} file already detected: {include!r}")
                continue
            include_list.append(include)

    for exclude in overrides.exclude or []:
        if "*" in exclude or "?" in exclude:
            matches = sorted(path.glob(exclude))
            if not matches:
                log.warn(f"No matches for specified %{kind} file glob: {exclude!r}")
            exclude_list.extend(str(match.relative_to(path)) for match in matches)
        else:
            if exclude not in results:
                log.warn(f"Manually excluded %{kind} file not detected: {exclude!r}")
                continue
            exclude_list.append(exclude)

    results.update(include_list)

    # Exclude globs can match files that were never detected, and the same
    # file can be listed by several excludes. `set.remove` would raise
    # KeyError in both cases, so drop entries without requiring membership.
    results.difference_update(exclude_list)

    return sorted(results)

205 

206 

207def filter_files(kind: str, path: Path, files: set[str], overrides: list[str] | FileInEx) -> list[str]: 

208 """Filter detected files given specific overrides or include and exclude settings. 

209 

210 This function just calls `filter_files_list` or `filter_files_in_ex` 

211 depending on the type of the `overrides` argument. 

212 

213 Arguments: 

214 kind: The kind of file currently being processed (`"license"` or `"doc"`). 

215 path: Root directory of the project sources. 

216 files: Heuristically determined set of files. 

217 overrides: List of file names or globs for both include and exclude 

218 settings, or include / exclude settings. 

219 

220 Returns: 

221 Sorted list of file names relative to the project root directory. 

222 

223 """ 

224 if isinstance(overrides, list): 

225 return filter_files_list(kind, path, overrides) 

226 

227 if isinstance(overrides, FileInEx): 

228 return filter_files_in_ex(kind, path, files, overrides) 

229 

230 raise ValueError # pragma nocover 

231 

232 

def get_license_files(path: Path, tomlconf: TomlConf) -> list[str]:
    """Collect license files below the given path and apply the configured overrides.

    The whole directory tree is walked, except for directories whose name is
    listed in `LICENSE_EXCLUDE_DIRS`. A file counts as a license file if
    `LICENSE_FILE_PATTERN` matches at the start of its name.

    Arguments:
        path: Root directory of the project sources.
        tomlconf: Global rust2rpm configuration, which contains settings for
            filtering / overriding heuristics for license and documentation
            file detection.

    Returns:
        Sorted list of file names relative to the project root directory.

    """
    detected: set[str] = set()

    for dirpath, subdirs, filenames in os.walk(path, topdown=True):
        # In-place assignment prunes the walk: os.walk (top-down) does not
        # descend into directories removed from this list.
        subdirs[:] = [name for name in subdirs if name not in LICENSE_EXCLUDE_DIRS]

        parent = Path(dirpath)
        detected.update(
            str((parent / name).relative_to(path)) for name in filenames if LICENSE_FILE_PATTERN.match(name)
        )

    return filter_files("license", path, detected, tomlconf.package.license_files)

255 

256 

def get_doc_files(path: Path, tomlconf: TomlConf) -> list[str]:
    """Crawl the given path for documentation files and apply overrides / filtering rules.

    Only files located directly in the root directory are considered;
    subdirectories are not searched. A file is treated as documentation if its
    full name matches `DOC_FILE_PATTERN` and matches neither
    `LICENSE_FILE_PATTERN` nor `DOC_FILE_EXCLUDES`.

    Arguments:
        path: Root directory of the project sources.
        tomlconf: Global rust2rpm configuration, which contains settings for
            filtering / overriding heuristics for license and documentation
            file detection.

    Returns:
        Sorted list of file names relative to the project root directory.

    """
    results: set[str] = set()

    for _root, dirs, files in os.walk(path, topdown=True):
        # Emptying the list in place stops os.walk from descending, so this
        # loop body only ever runs for the project root itself.
        dirs[:] = []

        # Because only the root is visited, the bare file names already are
        # the paths relative to the root (and can never start with "target/").
        results.update(
            name
            for name in files
            if DOC_FILE_PATTERN.fullmatch(name)
            and not LICENSE_FILE_PATTERN.fullmatch(name)
            and not DOC_FILE_EXCLUDES.fullmatch(name)
        )

    return filter_files("doc", path, results, tomlconf.package.doc_files)