#!/usr/bin/env python3
"""Audit the metadata of every EPUB file found under a directory tree.

Each ``.epub`` is opened as a zip archive, its OPF package document is located
through ``META-INF/container.xml``, and a fixed list of metadata rules (title,
author, date, publisher, identifier, language, cover) is applied.  Books that
break at least one rule are listed on standard output with their errors.

User-facing messages are intentionally in French.
"""
import os
import sys
import zipfile
import xml.etree.ElementTree as ET
import re
import argparse
import datetime
from typing import List, Dict, Optional

# --- Configuration & Constants ---
_OPF_NS = 'http://www.idpf.org/2007/opf'
_CONTAINER_NS = 'urn:oasis:names:tc:opendocument:xmlns:container'
NS = {'opf': _OPF_NS, 'dc': 'http://purl.org/dc/elements/1.1/'}

# Register the prefixes with ElementTree; lookups below still pass NS
# explicitly, so this only affects how ElementTree would serialize names.
for prefix, uri in NS.items():
    ET.register_namespace(prefix, uri)


class VerificationResult:
    """Collects the audit errors found for a single EPUB file."""

    def __init__(self, filename: str, rel_path: str):
        # Base name of the audited file.
        self.filename = filename
        # Path of the audited file relative to the scanned root directory.
        self.rel_path = rel_path
        # Human-readable error messages, in the order they were detected.
        self.errors: List[str] = []

    def add_error(self, error: str):
        """Append one error message to the result."""
        self.errors.append(error)

    def is_valid(self) -> bool:
        """Return True when no error has been recorded."""
        return not self.errors


def get_opf_path(epub_zip: zipfile.ZipFile) -> Optional[str]:
    """Return the OPF path declared in ``META-INF/container.xml``.

    The ``full-path`` attribute of the first ``rootfile`` element is returned.
    ``None`` is returned when the container file is missing or unreadable, when
    it declares no rootfile, or when the attribute is absent.
    """
    try:
        with epub_zip.open('META-INF/container.xml') as container_file:
            root = ET.parse(container_file).getroot()
    except Exception:
        # Deliberate best effort: any failure to read or parse the container
        # is reported by the caller as "OPF not found".
        return None

    rootfile = root.find(f".//{{{_CONTAINER_NS}}}rootfile")
    if rootfile is None:
        return None
    return rootfile.get('full-path')


def _children(parent: Optional[ET.Element], tag: str) -> List[ET.Element]:
    """Return direct children named *tag*, OPF-namespaced or un-namespaced."""
    if parent is None:
        return []
    return parent.findall(f"{{{_OPF_NS}}}{tag}") + parent.findall(tag)


def _find_metadata(root: ET.Element) -> Optional[ET.Element]:
    """Locate the ``metadata`` element, falling back to an un-namespaced one."""
    metadata = root.find(f".//{{{_OPF_NS}}}metadata")
    if metadata is None:
        metadata = root.find("metadata")
    return metadata


def _dc_text(metadata: Optional[ET.Element], name: str) -> Optional[str]:
    """Return the stripped text of the first ``dc:<name>`` element, if any."""
    if metadata is None:
        return None
    elem = metadata.find(f"dc:{name}", NS)
    if elem is not None and elem.text:
        return elem.text.strip()
    return None


def _dc_texts(metadata: Optional[ET.Element], name: str) -> List[str]:
    """Return the stripped text of every non-empty ``dc:<name>`` element."""
    if metadata is None:
        return []
    return [e.text.strip() for e in metadata.findall(f"dc:{name}", NS) if e.text]


def _check_title(metadata: Optional[ET.Element], result: VerificationResult) -> None:
    """Rule 1: the title must exist and must not look like a raw file name."""
    title = _dc_text(metadata, 'title')
    if not title:
        result.add_error("Titre vide")
        return
    if '_' in title:
        result.add_error("Titre sale (Underscores)")
    if title.lower().endswith('.epub'):
        result.add_error("Titre sale (Extension .epub)")
    if title.isdigit():
        result.add_error("Titre sale (Numérique)")


def _check_authors(metadata: Optional[ET.Element], result: VerificationResult) -> None:
    """Rule 2: at least one creator, none of them a placeholder."""
    authors = _dc_texts(metadata, 'creator')
    if not authors:
        result.add_error("Auteur vide ou 'Unknown'")
        return
    for author in authors:
        if not author or author.lower() == 'unknown':
            result.add_error(f"Auteur invalide ('{author}')")
        if ',' in author:
            result.add_error(f"Auteur format invalide (Contient virgule): {author}")
        if 'calibre' in author.lower():
            result.add_error(f"Auteur sale (Contient 'Calibre'): {author}")


def _check_date(metadata: Optional[ET.Element], result: VerificationResult) -> None:
    """Rule 3: the date must contain a four-digit year that is not in the future."""
    date = _dc_text(metadata, 'date')
    if not date:
        result.add_error("Date vide")
        return
    match = re.search(r'(\d{4})', date)
    if not match:
        result.add_error(f"Date invalide (pas 4 chiffres): {date}")
        return
    year = int(match.group(1))
    # One year of slack so that announced, not-yet-released books pass.
    if year > datetime.datetime.now().year + 1:
        result.add_error(f"Année future: {year}")


def _check_publisher(metadata: Optional[ET.Element], result: VerificationResult) -> None:
    """Rule 4: the publisher must exist and be neither a URL nor a placeholder."""
    publisher = _dc_text(metadata, 'publisher')
    if not publisher:
        result.add_error("Éditeur manquant")
        return
    if re.search(r'https?://', publisher) or re.search(r'www\.', publisher):
        result.add_error("Éditeur sale (URL détectée)")
    if publisher.lower() in ['unknown', 'smashwords']:
        result.add_error(f"Éditeur générique ({publisher})")


def _is_valid_identifier(text: str, scheme: str) -> bool:
    """Tell whether one non-empty identifier counts as an ISBN or a URN.

    *scheme* is the lower-cased ``opf:scheme`` attribute ('' when absent).
    """
    if scheme == 'isbn':
        return True
    lowered = text.lower()
    if lowered.startswith('urn:'):
        # Any URN is accepted, including urn:uuid:...; the only URNs rejected
        # are those mentioning "uuid" somewhere other than right after "urn:".
        return 'uuid' not in lowered or lowered.startswith('urn:uuid')
    digits = text.replace('-', '').replace(' ', '')
    # ISBN-10 (nine digits plus a digit or X check character) or ISBN-13.
    return bool(re.match(r'^\d{9}[\dX]$', digits) or re.match(r'^\d{13}$', digits))


def _check_identifiers(metadata: Optional[ET.Element], result: VerificationResult) -> None:
    """Rule 5: at least one identifier must be an ISBN or a URN.

    Identifiers whose ``opf:scheme`` is ``calibre`` never count as valid.
    """
    identifiers = metadata.findall("dc:identifier", NS) if metadata is not None else []
    valid_id_found = False
    for ident in identifiers:
        text = ident.text.strip() if ident.text else ""
        if not text:
            continue
        scheme = ident.get(f"{{{_OPF_NS}}}scheme", "").lower()
        if scheme == 'calibre':
            # Checked first so a calibre-scheme value can never validate the book.
            continue
        if _is_valid_identifier(text, scheme):
            valid_id_found = True

    if valid_id_found:
        return
    if not identifiers:
        result.add_error("Identifiant vide")
    else:
        result.add_error("Aucun identifiant valide (Uniquement Calibre/Inconnu)")


def _check_language(metadata: Optional[ET.Element], result: VerificationResult) -> None:
    """Rule 6: the language must be a 2-3 letter code with an optional subtag."""
    lang = _dc_text(metadata, 'language')
    if not lang:
        result.add_error("Langue vide")
    elif not re.match(r'^[a-zA-Z]{2,3}(-[a-zA-Z]+)?$', lang):
        result.add_error(f"Code langue invalide: {lang}")


def _check_cover(root: ET.Element, metadata: Optional[ET.Element],
                 result: VerificationResult) -> None:
    """Rule 7: the package must declare a cover image.

    A cover is found when either
    * a ``meta`` element with ``name="cover"`` points, through its ``content``
      attribute, at the ``id`` of a manifest item, or
    * a manifest item carries ``cover-image`` in its ``properties`` attribute.
    """
    cover_id = None
    # In a namespaced package the meta elements inherit the OPF namespace, so
    # both the namespaced and the bare tag names have to be searched.
    for meta in _children(metadata, 'meta'):
        if meta.get('name') == 'cover':
            cover_id = meta.get('content')
            break

    manifest = root.find(f"{{{_OPF_NS}}}manifest")
    if manifest is None:
        manifest = root.find("manifest")
    items = _children(manifest, 'item')

    cover_found = bool(cover_id) and any(item.get('id') == cover_id for item in items)
    if not cover_found:
        cover_found = any(
            'cover-image' in item.get('properties', '').split() for item in items
        )
    if not cover_found:
        result.add_error("Couverture absente (Manifest ou Meta manquant)")


def validate_epub(filepath: str, root_dir: str) -> VerificationResult:
    """Audit one EPUB file and return the errors found.

    Args:
        filepath: Path of the EPUB file to audit.
        root_dir: Directory the reported relative path is computed against.

    Returns:
        A VerificationResult whose ``errors`` list is empty when every rule
        passes.  Structural problems (not a zip, OPF missing or unparsable)
        stop the audit after a single error.  Any unexpected exception is
        recorded as an "Erreur technique" entry instead of being raised.
    """
    rel_path = os.path.relpath(filepath, root_dir)
    filename = os.path.basename(filepath)
    result = VerificationResult(filename, rel_path)

    try:
        if not zipfile.is_zipfile(filepath):
            result.add_error("Fichier corrompu ou non valide (pas un zip)")
            return result

        with zipfile.ZipFile(filepath, 'r') as zf:
            opf_path = get_opf_path(zf)
            if not opf_path:
                result.add_error("Fichier OPF introuvable (structure invalide)")
                return result

            if opf_path not in zf.namelist():
                result.add_error(f"Fichier OPF déclaré mais absent: {opf_path}")
                return result

            with zf.open(opf_path) as opf_file:
                try:
                    root = ET.parse(opf_file).getroot()
                except ET.ParseError:
                    result.add_error("Erreur de parsing XML du fichier OPF")
                    return result

        # When metadata is None every rule below reports its "empty" error.
        metadata = _find_metadata(root)
        _check_title(metadata, result)
        _check_authors(metadata, result)
        _check_date(metadata, result)
        _check_publisher(metadata, result)
        _check_identifiers(metadata, result)
        _check_language(metadata, result)
        _check_cover(root, metadata, result)
    except Exception as e:
        result.add_error(f"Erreur technique: {str(e)}")

    return result


def _icon_for(err: str) -> str:
    """Pick the report icon: a warning for a missing publisher, a cross otherwise."""
    lowered = err.lower()
    is_missing = "manquant" in lowered or "absent" in lowered
    if is_missing and "éditeur" in lowered:
        return "⚠️"
    return "❌"


def main():
    """Scan the directory given on the command line and print the audit report."""
    parser = argparse.ArgumentParser(description='Audit EPUB Metadata')
    parser.add_argument('directory', help='Root directory to scan')
    args = parser.parse_args()

    root_dir = os.path.abspath(args.directory)
    if not os.path.exists(root_dir):
        print(f"Erreur: Le dossier '{root_dir}' n'existe pas.")
        sys.exit(1)

    print("📚 Audit Ebook Exhaustif")

    perfect_count = 0
    error_count = 0

    for current_root, dirs, files in os.walk(root_dir):
        # Sorting both lists in place makes the traversal order deterministic.
        dirs.sort()
        files.sort()
        for filename in files:
            if not filename.lower().endswith('.epub'):
                continue
            filepath = os.path.join(current_root, filename)
            res = validate_epub(filepath, root_dir)

            if res.is_valid():
                perfect_count += 1
                continue

            # Only books with at least one error are listed.
            error_count += 1
            print(f"📖 {res.filename}")
            print(f" 🔗 {res.rel_path}")
            for err in res.errors:
                print(f" └ {_icon_for(err)} {err}")
            print("...")

    print(f"✅ [{perfect_count}] Livres parfaits / [{error_count}] Livres avec erreurs")


if __name__ == '__main__':
    main()