Coverage for torrentfile\recheck.py: 100%

278 statements  

« prev     ^ index     » next       coverage.py v7.3.0, created at 2023-08-27 21:50 -0700

1#! /usr/bin/python3 

2# -*- coding: utf-8 -*- 

3 

4############################################################################## 

5# Copyright (C) 2021-current alexpdev 

6# 

7# Licensed under the Apache License, Version 2.0 (the "License"); 

8# you may not use this file except in compliance with the License. 

9# You may obtain a copy of the License at 

10# 

11# http://www.apache.org/licenses/LICENSE-2.0 

12# 

13# Unless required by applicable law or agreed to in writing, software 

14# distributed under the License is distributed on an "AS IS" BASIS, 

15# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

16# See the License for the specific language governing permissions and 

17# limitations under the License. 

18############################################################################## 

"""
Module containing the Checker class.

The Checker class takes a torrentfile and the path to its contents.
It will then iterate through every file and directory contained
and compare their data to values contained within the torrent file.
Completion percentages will be printed to screen for each file and
at the end for the torrentfile as a whole.
"""

28 

29import os 

30import logging 

31from hashlib import sha1, sha256 # nosec 

32from pathlib import Path 

33 

34import pyben 

35 

36from torrentfile.hasher import FileHasher 

37from torrentfile.mixins import ProgMixin 

38from torrentfile.utils import ArgumentError, MissingPathError 

39 

40SHA1 = 20 

41SHA256 = 32 

42BLOCK_SIZE = 2**14 # 16KiB 

43 

44logger = logging.getLogger(__name__) 

45 

46 

class Checker:
    """
    Check a given file or directory to see if it matches a torrentfile.

    Public constructor for Checker class instance.

    Parameters
    ----------
    metafile : str
        Path to ".torrent" file.
    path : str
        Path where the content is located in filesystem.

    Example
    -------
    >> metafile = "/path/to/torrentfile/content_file_or_dir.torrent"
    >> location = "/path/to/location"
    >> os.path.exists("/path/to/location/content_file_or_dir")
    Out: True
    >> checker = Checker(metafile, location)
    """

    # Optional callback registered by third-party programs; it receives
    # every INFO-level log message emitted during a recheck.
    _hook = None

    def __init__(self, metafile: str, path: str):
        """
        Validate data against hashes contained in .torrent file.

        Parameters
        ----------
        metafile : str
            path to .torrent file
        path : str
            path to content or contents parent directory.

        Raises
        ------
        FileNotFoundError
            if the metafile path does not exist.
        ArgumentError
            if the metafile path is a directory instead of a file.
        """
        if not os.path.exists(metafile):
            raise FileNotFoundError
        if os.path.isdir(metafile):
            raise ArgumentError(
                "The <metafile> must be a .torrent file. Not a directory")
        self.last_log = None
        self.log_msg("Checking: %s, %s", metafile, path)
        self.metafile = metafile
        self.total = 0
        self.paths = []
        self.fileinfo = {}
        print("Extracting data from torrent file...")
        self.meta = pyben.load(metafile)
        self.info = self.meta["info"]
        self.name = self.info["name"]
        self.piece_length = self.info["piece length"]

        # v2 and hybrid metafiles carry a "meta version" key; hybrid
        # torrents additionally keep the v1-style "pieces" field.
        if "meta version" in self.info:
            if "pieces" in self.info:
                self.meta_version = 3
            else:
                self.meta_version = 2
        else:
            self.meta_version = 1

        self.root = self.find_root(path)
        self.check_paths()

    @classmethod
    def register_callback(cls, hook):
        """
        Register hooks from 3rd party programs to access generated info.

        Parameters
        ----------
        hook : function
            callback function for the logging feature.
        """
        cls._hook = hook

    def piece_checker(self):
        """
        Check individual pieces of the torrent.

        Returns
        -------
        HashChecker | FeedChecker
            Individual piece hasher class; FeedChecker for v1
            metafiles, HashChecker for v2 and hybrid metafiles.
        """
        if self.meta_version == 1:
            return FeedChecker
        return HashChecker

    def results(self):
        """
        Generate result percentage and store for future calls.

        Returns
        -------
        float
            percentage of the torrent content found intact on disk.
        """
        # Exhaust the generator purely for its side effect of setting
        # self._result; the individual responses are not needed here.
        for _ in self.iter_hashes():
            pass

        self.log_msg("Final result for %s recheck: %s", self.metafile,
                     self._result)

        return self._result

    def log_msg(self, *args, level: int = logging.INFO):
        """
        Log message `msg` to logger and send `msg` to callback hook.

        Parameters
        ----------
        *args : dict
            formatting args for log message
        level : int
            Log level for this message; default=`logging.INFO`
        """
        # args[0] is a %-style format string; remaining args fill it in.
        message = args[0]
        if len(args) >= 3:
            message = message % tuple(args[1:])
        elif len(args) == 2:
            message = message % args[1]

        # Repeat log messages should be ignored.
        if message != self.last_log:
            self.last_log = message
            logger.log(level, message)
            # Only INFO-level messages are forwarded to the hook.
            if self._hook and level == logging.INFO:
                self._hook(message)

    def find_root(self, path: str) -> str:
        """
        Check path for torrent content.

        The path can be a relative or absolute filesystem path. In the case
        where the content is a single file, the path may point directly to
        the file, or it may point to the parent directory. If content points
        to a directory. The directory will be checked to see if it matches
        the torrent's name, if not the directories contents will be searched.
        The returned value will be the absolute path that matches the
        torrent's name.

        Parameters
        ----------
        path : str
            root path to torrent content

        Returns
        -------
        str
            root path to content

        Raises
        ------
        FileNotFoundError
            if the content cannot be located at or under `path`.
        """
        if not os.path.exists(path):
            self.log_msg("Could not locate torrent content %s.", path)
            raise FileNotFoundError(path)

        root = Path(path)
        if root.name == self.name:
            self.log_msg("Content found: %s.", str(root))
            return root

        # Path may be the parent directory of the content.
        if self.name in os.listdir(root):
            return root / self.name

        self.log_msg("Could not locate torrent content in: %s", str(root))
        raise FileNotFoundError(root)

    def check_paths(self):
        """
        Gather all file paths described in the torrent file.
        """
        finfo = self.fileinfo

        # A top-level "length" key means the torrent is a single file.
        if "length" in self.info:
            self.log_msg("%s points to a single file", self.root)
            self.total = self.info["length"]
            self.paths.append(str(self.root))

            finfo[0] = {
                "path": self.root,
                "length": self.info["length"],
            }

            if self.meta_version > 1:
                root = self.info["file tree"][self.name][""]["pieces root"]
                finfo[0]["pieces root"] = root

            return

        # Otherwise Content is more than 1 file.
        self.log_msg("%s points to a directory", self.root)
        if self.meta_version == 1:
            for i, item in enumerate(self.info["files"]):
                self.total += item["length"]
                base = os.path.join(*item["path"])

                self.fileinfo[i] = {
                    "path": str(self.root / base),
                    "length": item["length"],
                }

                self.paths.append(str(self.root / base))
            return

        # v2 / hybrid metafiles describe contents as a nested file tree.
        self.walk_file_tree(self.info["file tree"], [])

    def walk_file_tree(self, tree: dict, partials: list):
        """
        Traverse File Tree dictionary to get file details.

        Extract full pathnames, length, root hash, and layer hashes
        for each file included in the .torrent's file tree.

        Parameters
        ----------
        tree : dict
            File Tree dict extracted from torrent file.
        partials : list
            list of intermediate pathnames.
        """
        for key, val in tree.items():
            # Empty string means the tree's leaf is value
            if "" in val:
                base = os.path.join(*partials, key)
                length = val[""]["length"]
                # Zero-length files carry no pieces and no root hash.
                roothash = None if not length else val[""]["pieces root"]
                full = str(self.root / base)
                self.fileinfo[len(self.paths)] = {
                    "path": full,
                    "length": length,
                    "pieces root": roothash,
                }
                self.paths.append(full)
                self.total += length
            else:
                self.walk_file_tree(val, partials + [key])

    def iter_hashes(self) -> tuple:
        """
        Produce results of comparing torrent contents piece by piece.

        Yields
        ------
        chunck : bytes
            hash of data found on disk
        piece : bytes
            hash of data when complete and correct
        path : str
            path to file being hashed
        size : int
            length of bytes hashed for piece
        """
        matched = consumed = 0
        checker = self.piece_checker()
        for chunk, piece, path, size in checker(self):
            consumed += size
            if chunk == piece:
                matched += size
            yield chunk, piece, path, size
            # Guard against ZeroDivisionError for torrents made up
            # entirely of zero-length files (total or consumed == 0).
            total_consumed = str(
                int(consumed / self.total * 100)) if self.total else "0"
            percent_matched = str(
                int(matched / consumed * 100)) if consumed else "0"
            self.log_msg(
                "Processed: %s%%, Matched: %s%%",
                total_consumed,
                percent_matched,
            )
        self._result = (matched / consumed) * 100 if consumed > 0 else 0

312 

313 

class FeedChecker(ProgMixin):
    """
    Validates torrent content.

    Seamlessly validate torrent file contents by comparing hashes in
    metafile against data on disk.

    Parameters
    ----------
    checker : object
        the checker class instance.
    """

    def __init__(self, checker: Checker):
        """
        Generate hashes of piece length data from filelist contents.

        Parameters
        ----------
        checker : Checker
            checker instance holding the parsed v1 metafile data.
        """
        self.piece_length = checker.piece_length
        self.paths = checker.paths
        # Concatenated SHA1 digests from the metafile; each expected
        # piece is a fixed 20-byte slice of this buffer.
        self.pieces = checker.info["pieces"]
        self.fileinfo = checker.fileinfo
        self.piece_map = {}
        self.index = 0
        self.piece_count = 0
        self.it = None

    def __iter__(self):
        """
        Assign iterator and return self.
        """
        self.it = self.iter_pieces()
        return self

    def __next__(self):
        """
        Yield back result of comparison.

        Returns
        -------
        tuple
            (disk piece hash, expected piece hash, file path, byte count)
        """
        try:
            partial = next(self.it)
        except StopIteration as itererror:
            raise StopIteration from itererror

        chunck = sha1(partial).digest()  # nosec
        # Slice the expected digest out of the metafile's pieces buffer.
        start = self.piece_count * SHA1
        end = start + SHA1
        piece = self.pieces[start:end]
        self.piece_count += 1
        path = self.paths[self.index]
        return chunck, piece, path, len(partial)

    def iter_pieces(self):
        """
        Iterate through, and hash pieces of torrent contents.

        Yields
        ------
        piece : bytes
            hash digest for block of torrent data.
        """
        # `partial` carries leftover bytes across file boundaries so
        # pieces stay aligned to the torrent's global piece grid.
        partial = bytearray()
        for i, path in enumerate(self.paths):
            total = self.fileinfo[i]["length"]
            self.progbar = self.get_progress_tracker(total, path)
            self.index = i
            if os.path.exists(path):
                for piece in self.extract(path, partial):
                    # Only a full piece — or the tail piece of the very
                    # last file — is hashed; shorter chunks are carried
                    # over into the next file's data.
                    if (len(piece) == self.piece_length) or (i + 1 == len(
                            self.paths)):
                        yield piece
                    else:
                        partial = piece

            else:
                # File missing on disk: substitute zero-filled padding
                # so subsequent pieces remain correctly aligned.
                length = self.fileinfo[i]["length"]
                for pad in self._gen_padding(partial, length):
                    if len(pad) == self.piece_length:
                        yield pad
                    else:
                        partial = pad
            self.progbar.close_out()

    def extract(self, path: str, partial: bytearray) -> bytearray:
        """
        Split file paths contents into blocks of data for hash pieces.

        NOTE: this is a generator function; it yields piece-sized
        bytearrays rather than returning a single value.

        Parameters
        ----------
        path : str
            path to content.
        partial : bytes
            any remaining content from last file.

        Returns
        -------
        bytearray
            Hash digest for block of .torrent contents.
        """
        read = 0
        length = self.fileinfo[self.index]["length"]
        # A full-size carry-over was already yielded; start fresh.
        partial = bytearray() if len(partial) == self.piece_length else partial
        if path not in self.paths:  # pragma: no cover
            raise MissingPathError(path)
        with open(path, "rb") as current:
            while True:
                # Read only enough to top `partial` up to one piece.
                bitlength = self.piece_length - len(partial)
                part = bytearray(bitlength)
                amount = current.readinto(part)
                read += amount
                partial.extend(part[:amount])
                if amount < bitlength:
                    # Short read == EOF; yield the tail only if the
                    # file was fully consumed.
                    if amount > 0 and read == length:
                        self.progbar.update(amount)
                        yield partial
                    break
                self.progbar.update(amount)
                yield partial
                partial = bytearray(0)
        if length != read:
            # File on disk is shorter than the metafile says: pad the
            # difference with zeros to preserve piece alignment.
            for pad in self._gen_padding(partial, length, read):
                yield pad

    def _gen_padding(self, partial: bytes, length: int, read=0) -> bytes:
        """
        Create padded pieces where file sizes do not match.

        Parameters
        ----------
        partial : bytes
            any remaining data from last file processed.
        length : int
            size of space that needs padding
        read : int
            portion of length already padded

        Yields
        ------
        bytes
            A piece length sized block of zeros.
        """
        while read < length:
            left = self.piece_length - len(partial)
            if length - read > left:
                # Fill out a complete piece with zeros.
                padding = bytearray(left)
                partial.extend(padding)
                yield partial
                read += left
                partial = bytearray(0)
            else:
                # Remaining gap fits inside one piece.
                partial.extend(bytearray(length - read))
                read = length
                yield partial

465 

466 

class HashChecker(ProgMixin):
    """
    Iterate through contents of meta data and verify with file contents.

    Used for v2 and hybrid metafiles, which store per-file SHA256 piece
    layers instead of a single concatenated v1 pieces buffer.

    Parameters
    ----------
    checker : Checker
        the checker instance that maintains variables.
    """

    def __init__(self, checker: Checker):
        """
        Construct a HybridChecker instance.

        Parameters
        ----------
        checker : Checker
            checker instance holding the parsed metafile data.
        """
        self.checker = checker
        self.paths = checker.paths
        self.piece_length = checker.piece_length
        self.fileinfo = checker.fileinfo
        # Maps each file's "pieces root" to its layer of SHA256 hashes.
        self.piece_layers = checker.meta["piece layers"]
        self.current = None
        # -1 so the first next_file() call advances to index 0.
        self.index = -1

    def __iter__(self):
        """
        Assign iterator and return self.
        """
        return self

    def __next__(self):
        """
        Provide the result of comparison.

        Returns
        -------
        tuple
            (disk layer hash, expected piece hash, file path, byte count)
        """
        if self.current is None:
            self.next_file()
        try:
            return self.process_current()
        except StopIteration as itererr:
            # Current file exhausted; move on to the next one if any.
            if self.next_file():
                return self.process_current()
            raise StopIteration from itererr

    class Padder:
        """
        Padding class to generate padding hashes wherever needed.

        Parameters
        ----------
        length: int
            the total size of the mock file generating padding for.
        piece_length : int
            the block size that each hash represents.
        """

        def __init__(self, length, piece_length):
            """
            Construct padding class to Mock missing or incomplete files.

            Parameters
            ----------
            length : int
                size of the file
            piece_length : int
                the piece length for each iteration.
            """
            self.length = length
            self.piece_length = piece_length
            # Precomputed digest of one full piece of zeros; reused for
            # every whole-piece pad.
            self.pad = sha256(bytearray(piece_length)).digest()

        def __iter__(self):
            """
            Return self to correctly implement iterator type.
            """
            return self  # pragma: nocover

        def __next__(self) -> bytes:
            """
            Iterate through seemingly endless sha256 hashes of zeros.

            Returns
            -------
            tuple :
                returns the padding

            Raises
            ------
            StopIteration
            """
            if self.length >= self.piece_length:
                self.length -= self.piece_length
                return self.pad
            if self.length > 0:
                # Final short piece: hash only the remaining zeros.
                pad = sha256(bytearray(self.length)).digest()
                self.length -= self.length
                return pad
            raise StopIteration

    def next_file(self) -> bool:
        """
        Remove all references to processed files and prepare for the next.

        Returns
        -------
        bool
            if there is a next file found
        """
        self.index += 1
        # NOTE(review): when current is None and paths is empty this
        # branch would index past the list — assumes paths is non-empty
        # on first call; verify against Checker.check_paths.
        if self.current is None or self.index < len(self.paths):
            self.current = self.paths[self.index]
            self.length = self.fileinfo[self.index]["length"]
            self.root_hash = self.fileinfo[self.index]["pieces root"]
            # Files no larger than one piece store their single hash as
            # the root itself; larger files have a layer of hashes.
            if self.length > self.piece_length:
                self.pieces = self.piece_layers[self.root_hash]
            else:
                self.pieces = self.root_hash
            path = self.paths[self.index]
            self.progbar = self.get_progress_tracker(self.length, path)
            self.count = 0
            if os.path.exists(self.current):
                self.hasher = FileHasher(
                    path,
                    self.piece_length,
                    progress=2,
                    progress_bar=self.progbar,
                )
            else:
                # Missing file: substitute zero padding hashes.
                self.hasher = self.Padder(self.length, self.piece_length)
            return True
        if self.index >= len(self.paths):
            # All files processed; drop per-file state.
            del self.current
            del self.length
            del self.root_hash
            del self.pieces
        return False

    def process_current(self) -> tuple:
        """
        Gather necessary information to compare to metafile details.

        Returns
        -------
        tuple
            a tuple containing the layer, piece, current path and size

        Raises
        ------
        StopIteration
        """
        try:
            layer = next(self.hasher)
            piece, size = self.advance()
            self.progbar.update(size)
            return layer, piece, self.current, size
        except StopIteration as err:
            # Disk data ran out before the expected hashes did: switch
            # to zero padding for the remainder of this file.
            if self.length > 0 and self.count * SHA256 < len(self.pieces):
                self.hasher = self.Padder(self.length, self.piece_length)
                piece, size = self.advance()
                layer = next(self.hasher)
                self.progbar.update(0)
                return layer, piece, self.current, size
            raise StopIteration from err

    def advance(self) -> tuple:
        """
        Increment the number of pieces processed for the current file.

        Returns
        -------
        tuple
            the piece and size
        """
        # Expected hashes are fixed 32-byte SHA256 digests laid
        # back-to-back in self.pieces.
        start = self.count * SHA256
        end = start + SHA256
        piece = self.pieces[start:end]
        self.count += 1
        if self.length >= self.piece_length:
            self.length -= self.piece_length
            size = self.piece_length
        else:
            # Tail piece: consume whatever remains of the file.
            size = self.length
            self.length -= self.length
        return piece, size