diff --git a/audit_epubs.py b/audit_epubs.py index 952635b..0fee9ce 100644 --- a/audit_epubs.py +++ b/audit_epubs.py @@ -100,7 +100,8 @@ def validate_epub(filepath: str, root_dir: str) -> VerificationResult: result.add_error("Titre sale (Underscores)") if title.lower().endswith('.epub'): result.add_error("Titre sale (Extension .epub)") - if title.isdigit(): + if title.isdigit() and len(title) >= 5: + # Allow short numeric titles like "1984" result.add_error("Titre sale (Numérique)") # 2. Author @@ -177,12 +178,22 @@ def validate_epub(filepath: str, root_dir: str) -> VerificationResult: continue # Don't count as valid ID for the "Only Calibre" rule check elif scheme == 'isbn': valid_id_found = True + elif scheme in ('asin', 'mobi-asin'): + # Amazon identifiers + valid_id_found = True else: # Heuristics - if re.match(r'^\d{9}[\d|X]$', clean_id) or re.match(r'^\d{13}$', clean_id): + # Check for ISBN in various formats + if re.match(r'^\\d{9}[\\d|X]$', clean_id) or re.match(r'^\\d{13}$', clean_id): valid_id_found = True elif text.lower().startswith('urn:isbn'): valid_id_found = True + elif text.lower().startswith('isbn:'): + # Format: isbn:9782732497150 (used by Calibre polish) + valid_id_found = True + elif text.lower().startswith('asin:') or text.lower().startswith('mobi-asin:'): + # Amazon identifiers + valid_id_found = True elif text.lower().startswith('urn:uuid'): # We treat generic UUIDs as valid unless we are strict about "No Calibre UUID ONLY". # The rule: "Vide, ou ne contient aucun identifiant valide (ni ISBN, ni un URN standard). Si uniquement un UUID Calibre est présent -> Erreur." @@ -212,16 +223,20 @@ def validate_epub(filepath: str, root_dir: str) -> VerificationResult: # Look for # Then look for in cover_meta = None - if metadata: + if metadata is not None: # - for meta in metadata.findall(f"{{http://www.idpf.org/2007/opf}}meta"): # opf:meta not usually in 2.0? - pass - # The 'name' attribute is not in a namespace usually for OPF 2.0 meta elements? - # Actually in OPF 2.0: - for meta in metadata.findall("meta"): # Searching without namespace + # Try with namespace first (correct for valid EPUB 2) + for meta in metadata.findall(f"{{http://www.idpf.org/2007/opf}}meta"): if meta.get('name') == 'cover': cover_meta = meta.get('content') break + + # Fallback: search without namespace + if not cover_meta: + for meta in metadata.findall("meta"): + if meta.get('name') == 'cover': + cover_meta = meta.get('content') + break # Also check namespaced meta if parsing strict OPF 3.0? # element is in 3.0? # User asked for "cover" meta specifically or general cover check.