Coverage for torrentfile\recheck.py: 100%

278 statements  

« prev     ^ index     » next       coverage.py v7.3.0, created at 2023-08-27 21:50 -0700

1#! /usr/bin/python3 

2# -*- coding: utf-8 -*- 

3 

4############################################################################## 

5# Copyright (C) 2021-current alexpdev 

6# 

7# Licensed under the Apache License, Version 2.0 (the "License"); 

8# you may not use this file except in compliance with the License. 

9# You may obtain a copy of the License at 

10# 

11# http://www.apache.org/licenses/LICENSE-2.0 

12# 

13# Unless required by applicable law or agreed to in writing, software 

14# distributed under the License is distributed on an "AS IS" BASIS, 

15# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

16# See the License for the specific language governing permissions and 

17# limitations under the License. 

18############################################################################## 

"""
Module containing the Checker class.

The Checker class takes a torrentfile and the path to its contents.
It will then iterate through every file and directory contained
and compare their data to values contained within the torrent file.
Completion percentages will be printed to screen for each file and
at the end for the torrentfile as a whole.
"""

28 

29import os 

30import logging 

31from hashlib import sha1, sha256 # nosec 

32from pathlib import Path 

33 

34import pyben 

35 

36from torrentfile.hasher import FileHasher 

37from torrentfile.mixins import ProgMixin 

38from torrentfile.utils import ArgumentError, MissingPathError 

39 

40SHA1 = 20 

41SHA256 = 32 

42BLOCK_SIZE = 2**14 # 16KiB 

43 

44logger = logging.getLogger(__name__) 

45 

46 

class Checker:
    """
    Check a given file or directory to see if it matches a torrentfile.

    Public constructor for Checker class instance.

    Parameters
    ----------
    metafile : str
        Path to ".torrent" file.
    path : str
        Path where the content is located in filesystem.

    Example
    -------
    >> metafile = "/path/to/torrentfile/content_file_or_dir.torrent"
    >> location = "/path/to/location"
    >> os.path.exists("/path/to/location/content_file_or_dir")
    Out: True
    >> checker = Checker(metafile, location)
    """

    # Optional callback registered by third-party programs; it receives
    # every INFO-level log message emitted during a recheck.
    _hook = None

    def __init__(self, metafile: str, path: str):
        """
        Validate data against hashes contained in .torrent file.

        Parameters
        ----------
        metafile : str
            path to .torrent file
        path : str
            path to content or contents parent directory.

        Raises
        ------
        FileNotFoundError
            if the metafile path does not exist.
        ArgumentError
            if the metafile path is a directory instead of a file.
        """
        if not os.path.exists(metafile):
            raise FileNotFoundError
        if os.path.isdir(metafile):
            raise ArgumentError(
                "The <metafile> must be a .torrent file. Not a directory")
        self.last_log = None
        self.log_msg("Checking: %s, %s", metafile, path)
        self.metafile = metafile
        self.total = 0
        self.paths = []
        self.fileinfo = {}
        print("Extracting data from torrent file...")
        self.meta = pyben.load(metafile)
        self.info = self.meta["info"]
        self.name = self.info["name"]
        self.piece_length = self.info["piece length"]

        # v2 and hybrid metafiles carry a "meta version" key; hybrid
        # torrents additionally keep the v1-style "pieces" field.
        if "meta version" in self.info:
            if "pieces" in self.info:
                self.meta_version = 3
            else:
                self.meta_version = 2
        else:
            self.meta_version = 1

        self.root = self.find_root(path)
        self.check_paths()

    @classmethod
    def register_callback(cls, hook):
        """
        Register hooks from 3rd party programs to access generated info.

        Parameters
        ----------
        hook : function
            callback function for the logging feature.
        """
        cls._hook = hook

    def piece_checker(self):
        """
        Check individual pieces of the torrent.

        Returns
        -------
        HashChecker | FeedChecker
            Individual piece hasher class; FeedChecker for v1
            metafiles, HashChecker for v2 and hybrid metafiles.
        """
        if self.meta_version == 1:
            return FeedChecker
        return HashChecker

    def results(self):
        """
        Generate result percentage and store for future calls.

        Returns
        -------
        float
            percentage of the torrent content found intact on disk.
        """
        # Exhaust the generator purely for its side effect of setting
        # self._result; the individual responses are not needed here.
        for _ in self.iter_hashes():
            pass

        self.log_msg("Final result for %s recheck: %s", self.metafile,
                     self._result)

        return self._result

    def log_msg(self, *args, level: int = logging.INFO):
        """
        Log message `msg` to logger and send `msg` to callback hook.

        Parameters
        ----------
        *args : dict
            formatting args for log message
        level : int
            Log level for this message; default=`logging.INFO`
        """
        # args[0] is a %-style format string; remaining args fill it in.
        message = args[0]
        if len(args) >= 3:
            message = message % tuple(args[1:])
        elif len(args) == 2:
            message = message % args[1]

        # Repeat log messages should be ignored.
        if message != self.last_log:
            self.last_log = message
            logger.log(level, message)
            # Only INFO-level messages are forwarded to the hook.
            if self._hook and level == logging.INFO:
                self._hook(message)

    def find_root(self, path: str) -> str:
        """
        Check path for torrent content.

        The path can be a relative or absolute filesystem path. In the case
        where the content is a single file, the path may point directly to
        the file, or it may point to the parent directory. If content points
        to a directory. The directory will be checked to see if it matches
        the torrent's name, if not the directories contents will be searched.
        The returned value will be the absolute path that matches the
        torrent's name.

        Parameters
        ----------
        path : str
            root path to torrent content

        Returns
        -------
        str
            root path to content

        Raises
        ------
        FileNotFoundError
            if the content cannot be located at or under `path`.
        """
        if not os.path.exists(path):
            self.log_msg("Could not locate torrent content %s.", path)
            raise FileNotFoundError(path)

        root = Path(path)
        if root.name == self.name:
            self.log_msg("Content found: %s.", str(root))
            return root

        # Path may be the parent directory of the content.
        if self.name in os.listdir(root):
            return root / self.name

        self.log_msg("Could not locate torrent content in: %s", str(root))
        raise FileNotFoundError(root)

    def check_paths(self):
        """
        Gather all file paths described in the torrent file.
        """
        finfo = self.fileinfo

        # A top-level "length" key means the torrent is a single file.
        if "length" in self.info:
            self.log_msg("%s points to a single file", self.root)
            self.total = self.info["length"]
            self.paths.append(str(self.root))

            finfo[0] = {
                "path": self.root,
                "length": self.info["length"],
            }

            if self.meta_version > 1:
                root = self.info["file tree"][self.name][""]["pieces root"]
                finfo[0]["pieces root"] = root

            return

        # Otherwise Content is more than 1 file.
        self.log_msg("%s points to a directory", self.root)
        if self.meta_version == 1:
            for i, item in enumerate(self.info["files"]):
                self.total += item["length"]
                base = os.path.join(*item["path"])

                self.fileinfo[i] = {
                    "path": str(self.root / base),
                    "length": item["length"],
                }

                self.paths.append(str(self.root / base))
            return

        # v2 / hybrid metafiles describe contents as a nested file tree.
        self.walk_file_tree(self.info["file tree"], [])

    def walk_file_tree(self, tree: dict, partials: list):
        """
        Traverse File Tree dictionary to get file details.

        Extract full pathnames, length, root hash, and layer hashes
        for each file included in the .torrent's file tree.

        Parameters
        ----------
        tree : dict
            File Tree dict extracted from torrent file.
        partials : list
            list of intermediate pathnames.
        """
        for key, val in tree.items():
            # Empty string means the tree's leaf is value
            if "" in val:
                base = os.path.join(*partials, key)
                length = val[""]["length"]
                # Zero-length files carry no pieces and no root hash.
                roothash = None if not length else val[""]["pieces root"]
                full = str(self.root / base)
                self.fileinfo[len(self.paths)] = {
                    "path": full,
                    "length": length,
                    "pieces root": roothash,
                }
                self.paths.append(full)
                self.total += length
            else:
                self.walk_file_tree(val, partials + [key])

    def iter_hashes(self) -> tuple:
        """
        Produce results of comparing torrent contents piece by piece.

        Yields
        ------
        chunck : bytes
            hash of data found on disk
        piece : bytes
            hash of data when complete and correct
        path : str
            path to file being hashed
        size : int
            length of bytes hashed for piece
        """
        matched = consumed = 0
        checker = self.piece_checker()
        for chunk, piece, path, size in checker(self):
            consumed += size
            if chunk == piece:
                matched += size
            yield chunk, piece, path, size
            # Guard against ZeroDivisionError for torrents made up
            # entirely of zero-length files (total or consumed == 0).
            total_consumed = str(
                int(consumed / self.total * 100)) if self.total else "0"
            percent_matched = str(
                int(matched / consumed * 100)) if consumed else "0"
            self.log_msg(
                "Processed: %s%%, Matched: %s%%",
                total_consumed,
                percent_matched,
            )
        self._result = (matched / consumed) * 100 if consumed > 0 else 0

312 

313 

class FeedChecker(ProgMixin):
    """
    Validates torrent content.

    Seamlessly validate torrent file contents by comparing hashes in
    metafile against data on disk.

    Parameters
    ----------
    checker : object
        the checker class instance.
    """

    def __init__(self, checker: Checker):
        """
        Generate hashes of piece length data from filelist contents.

        Parameters
        ----------
        checker : Checker
            checker instance holding the parsed v1 metafile data.
        """
        self.piece_length = checker.piece_length
        self.paths = checker.paths
        # Concatenated SHA1 digests from the metafile; each expected
        # piece is a fixed 20-byte slice of this buffer.
        self.pieces = checker.info["pieces"]
        self.fileinfo = checker.fileinfo
        self.piece_map = {}
        self.index = 0
        self.piece_count = 0
        self.it = None

    def __iter__(self):
        """
        Assign iterator and return self.
        """
        self.it = self.iter_pieces()
        return self

    def __next__(self):
        """
        Yield back result of comparison.

        Returns
        -------
        tuple
            (disk piece hash, expected piece hash, file path, byte count)
        """
        try:
            partial = next(self.it)
        except StopIteration as itererror:
            raise StopIteration from itererror

        chunck = sha1(partial).digest()  # nosec
        # Slice the expected digest out of the metafile's pieces buffer.
        start = self.piece_count * SHA1
        end = start + SHA1
        piece = self.pieces[start:end]
        self.piece_count += 1
        path = self.paths[self.index]
        return chunck, piece, path, len(partial)

    def iter_pieces(self):
        """
        Iterate through, and hash pieces of torrent contents.

        Yields
        ------
        piece : bytes
            hash digest for block of torrent data.
        """
        # `partial` carries leftover bytes across file boundaries so
        # pieces stay aligned to the torrent's global piece grid.
        partial = bytearray()
        for i, path in enumerate(self.paths):
            total = self.fileinfo[i]["length"]
            self.progbar = self.get_progress_tracker(total, path)
            self.index = i
            if os.path.exists(path):
                for piece in self.extract(path, partial):
                    # Only a full piece — or the tail piece of the very
                    # last file — is hashed; shorter chunks are carried
                    # over into the next file's data.
                    if (len(piece) == self.piece_length) or (i + 1 == len(
                            self.paths)):
                        yield piece
                    else:
                        partial = piece

            else:
                # File missing on disk: substitute zero-filled padding
                # so subsequent pieces remain correctly aligned.
                length = self.fileinfo[i]["length"]
                for pad in self._gen_padding(partial, length):
                    if len(pad) == self.piece_length:
                        yield pad
                    else:
                        partial = pad
            self.progbar.close_out()

    def extract(self, path: str, partial: bytearray) -> bytearray:
        """
        Split file paths contents into blocks of data for hash pieces.

        NOTE: this is a generator function; it yields piece-sized
        bytearrays rather than returning a single value.

        Parameters
        ----------
        path : str
            path to content.
        partial : bytes
            any remaining content from last file.

        Returns
        -------
        bytearray
            Hash digest for block of .torrent contents.
        """
        read = 0
        length = self.fileinfo[self.index]["length"]
        # A full-size carry-over was already yielded; start fresh.
        partial = bytearray() if len(partial) == self.piece_length else partial
        if path not in self.paths:  # pragma: no cover
            raise MissingPathError(path)
        with open(path, "rb") as current:
            while True:
                # Read only enough to top `partial` up to one piece.
                bitlength = self.piece_length - len(partial)
                part = bytearray(bitlength)
                amount = current.readinto(part)
                read += amount
                partial.extend(part[:amount])
                if amount < bitlength:
                    # Short read == EOF; yield the tail only if the
                    # file was fully consumed.
                    if amount > 0 and read == length:
                        self.progbar.update(amount)
                        yield partial
                    break
                self.progbar.update(amount)
                yield partial
                partial = bytearray(0)
        if length != read:
            # File on disk is shorter than the metafile says: pad the
            # difference with zeros to preserve piece alignment.
            for pad in self._gen_padding(partial, length, read):
                yield pad

    def _gen_padding(self, partial: bytes, length: int, read=0) -> bytes:
        """
        Create padded pieces where file sizes do not match.

        Parameters
        ----------
        partial : bytes
            any remaining data from last file processed.
        length : int
            size of space that needs padding
        read : int
            portion of length already padded

        Yields
        ------
        bytes
            A piece length sized block of zeros.
        """
        while read < length:
            left = self.piece_length - len(partial)
            if length - read > left:
                # Fill out a complete piece with zeros.
                padding = bytearray(left)
                partial.extend(padding)
                yield partial
                read += left
                partial = bytearray(0)
            else:
                # Remaining gap fits inside one piece.
                partial.extend(bytearray(length - read))
                read = length
                yield partial

465 

466 

class HashChecker(ProgMixin):
    """
    Iterate through contents of meta data and verify with file contents.

    Used for v2 and hybrid metafiles, which store per-file SHA256 piece
    layers instead of a single concatenated v1 pieces buffer.

    Parameters
    ----------
    checker : Checker
        the checker instance that maintains variables.
    """

    def __init__(self, checker: Checker):
        """
        Construct a HybridChecker instance.

        Parameters
        ----------
        checker : Checker
            checker instance holding the parsed metafile data.
        """
        self.checker = checker
        self.paths = checker.paths
        self.piece_length = checker.piece_length
        self.fileinfo = checker.fileinfo
        # Maps each file's "pieces root" to its layer of SHA256 hashes.
        self.piece_layers = checker.meta["piece layers"]
        self.current = None
        # -1 so the first next_file() call advances to index 0.
        self.index = -1

    def __iter__(self):
        """
        Assign iterator and return self.
        """
        return self

    def __next__(self):
        """
        Provide the result of comparison.

        Returns
        -------
        tuple
            (disk layer hash, expected piece hash, file path, byte count)
        """
        if self.current is None:
            self.next_file()
        try:
            return self.process_current()
        except StopIteration as itererr:
            # Current file exhausted; move on to the next one if any.
            if self.next_file():
                return self.process_current()
            raise StopIteration from itererr

    class Padder:
        """
        Padding class to generate padding hashes wherever needed.

        Parameters
        ----------
        length: int
            the total size of the mock file generating padding for.
        piece_length : int
            the block size that each hash represents.
        """

        def __init__(self, length, piece_length):
            """
            Construct padding class to Mock missing or incomplete files.

            Parameters
            ----------
            length : int
                size of the file
            piece_length : int
                the piece length for each iteration.
            """
            self.length = length
            self.piece_length = piece_length
            # Precomputed digest of one full piece of zeros; reused for
            # every whole-piece pad.
            self.pad = sha256(bytearray(piece_length)).digest()

        def __iter__(self):
            """
            Return self to correctly implement iterator type.
            """
            return self  # pragma: nocover

        def __next__(self) -> bytes:
            """
            Iterate through seemingly endless sha256 hashes of zeros.

            Returns
            -------
            tuple :
                returns the padding

            Raises
            ------
            StopIteration
            """
            if self.length >= self.piece_length:
                self.length -= self.piece_length
                return self.pad
            if self.length > 0:
                # Final short piece: hash only the remaining zeros.
                pad = sha256(bytearray(self.length)).digest()
                self.length -= self.length
                return pad
            raise StopIteration

    def next_file(self) -> bool:
        """
        Remove all references to processed files and prepare for the next.

        Returns
        -------
        bool
            if there is a next file found
        """
        self.index += 1
        # NOTE(review): when current is None and paths is empty this
        # branch would index past the list — assumes paths is non-empty
        # on first call; verify against Checker.check_paths.
        if self.current is None or self.index < len(self.paths):
            self.current = self.paths[self.index]
            self.length = self.fileinfo[self.index]["length"]
            self.root_hash = self.fileinfo[self.index]["pieces root"]
            # Files no larger than one piece store their single hash as
            # the root itself; larger files have a layer of hashes.
            if self.length > self.piece_length:
                self.pieces = self.piece_layers[self.root_hash]
            else:
                self.pieces = self.root_hash
            path = self.paths[self.index]
            self.progbar = self.get_progress_tracker(self.length, path)
            self.count = 0
            if os.path.exists(self.current):
                self.hasher = FileHasher(
                    path,
                    self.piece_length,
                    progress=2,
                    progress_bar=self.progbar,
                )
            else:
                # Missing file: substitute zero padding hashes.
                self.hasher = self.Padder(self.length, self.piece_length)
            return True
        if self.index >= len(self.paths):
            # All files processed; drop per-file state.
            del self.current
            del self.length
            del self.root_hash
            del self.pieces
        return False

    def process_current(self) -> tuple:
        """
        Gather necessary information to compare to metafile details.

        Returns
        -------
        tuple
            a tuple containing the layer, piece, current path and size

        Raises
        ------
        StopIteration
        """
        try:
            layer = next(self.hasher)
            piece, size = self.advance()
            self.progbar.update(size)
            return layer, piece, self.current, size
        except StopIteration as err:
            # Disk data ran out before the expected hashes did: switch
            # to zero padding for the remainder of this file.
            if self.length > 0 and self.count * SHA256 < len(self.pieces):
                self.hasher = self.Padder(self.length, self.piece_length)
                piece, size = self.advance()
                layer = next(self.hasher)
                self.progbar.update(0)
                return layer, piece, self.current, size
            raise StopIteration from err

    def advance(self) -> tuple:
        """
        Increment the number of pieces processed for the current file.

        Returns
        -------
        tuple
            the piece and size
        """
        # Expected hashes are fixed 32-byte SHA256 digests laid
        # back-to-back in self.pieces.
        start = self.count * SHA256
        end = start + SHA256
        piece = self.pieces[start:end]
        self.count += 1
        if self.length >= self.piece_length:
            self.length -= self.piece_length
            size = self.piece_length
        else:
            # Tail piece: consume whatever remains of the file.
            size = self.length
            self.length -= self.length
        return piece, size