# epub-audit/audit_epubs.py

#!/usr/bin/env python3
import os
import sys
import zipfile
import xml.etree.ElementTree as ET
import re
import argparse
from typing import List, Dict, Optional

# --- Configuration & Constants ---
# Prefix -> namespace URI map, passed to ElementTree find()/findall() so that
# paths such as "dc:title" resolve to the right namespace.
NS = {
    'opf': 'http://www.idpf.org/2007/opf',
    'dc': 'http://purl.org/dc/elements/1.1/',
}

# Registration is global and only influences which prefixes ElementTree emits
# when serializing; lookups still rely on the explicit NS map above.
for prefix, uri in NS.items():
    ET.register_namespace(prefix, uri)

class VerificationResult:
    """Audit outcome for a single EPUB file.

    Attributes are plain data: the file's base name, its path relative to the
    scanned root, and the list of error messages collected so far.
    """

    def __init__(self, filename: str, rel_path: str):
        self.filename = filename
        self.rel_path = rel_path
        # Each instance gets its own list; messages are kept in insertion order.
        self.errors: List[str] = []

    def add_error(self, error: str):
        """Record one more error message for this file."""
        self.errors.append(error)

    def is_valid(self) -> bool:
        """Return True when no error has been recorded."""
        return not self.errors

def get_opf_path(epub_zip: zipfile.ZipFile) -> Optional[str]:
    """Return the OPF package path declared in ``META-INF/container.xml``.

    Args:
        epub_zip: An open EPUB archive.

    Returns:
        The ``full-path`` attribute of the first ``<rootfile>`` element that
        declares a non-empty one, or ``None`` when the container file is
        missing, unreadable, not well-formed, or declares no usable rootfile.
        This function never raises.
    """
    try:
        with epub_zip.open('META-INF/container.xml') as container_file:
            root = ET.parse(container_file).getroot()
    except Exception:
        # Deliberate best-effort: a missing member (KeyError), malformed XML
        # (ParseError) or a damaged archive all mean "location unknown"; the
        # caller reports that as an invalid EPUB structure.
        return None

    for elem in root.iter():
        tag = elem.tag
        # Compare the local name only, so a container.xml that omits the
        # OASIS container namespace is still understood.
        if not isinstance(tag, str) or tag.rsplit('}', 1)[-1] != 'rootfile':
            continue
        full_path = elem.get('full-path')
        # Skip entries without a path instead of giving up on the first one.
        if full_path:
            return full_path
    return None

# Clark-notation prefix for elements/attributes in the OPF package namespace.
_OPF_TAG_PREFIX = '{http://www.idpf.org/2007/opf}'

_YEAR_RE = re.compile(r'(\d{4})')
_URL_RE = re.compile(r'https?://|www\.')
# ISBN-10 (nine digits + digit or X check character) or ISBN-13, separators removed.
_ISBN_RE = re.compile(r'^(?:\d{9}[\dX]|\d{13})$')
# Primary language subtag followed by any number of alphanumeric subtags
# (e.g. "fr", "en-US", "es-419", "zh-Hant-TW").
_LANG_RE = re.compile(r'^[a-zA-Z]{2,3}(-[a-zA-Z0-9]+)*$')
_GENERIC_PUBLISHERS = ('unknown', 'smashwords')


def _opf_children(parent: Optional[ET.Element], local_name: str) -> List[ET.Element]:
    """Return direct children named *local_name*, with or without the OPF namespace."""
    if parent is None:
        return []
    return parent.findall(f"{_OPF_TAG_PREFIX}{local_name}") + parent.findall(local_name)


def _dc_text(metadata: Optional[ET.Element], elem_name: str) -> Optional[str]:
    """Return the stripped text of the first ``dc:<elem_name>`` child, if it has any."""
    if metadata is None:
        return None
    elem = metadata.find(f"dc:{elem_name}", NS)
    if elem is None or not elem.text:
        return None
    return elem.text.strip()


def _dc_all_text(metadata: Optional[ET.Element], elem_name: str) -> List[str]:
    """Return the stripped text of every ``dc:<elem_name>`` child that has text."""
    if metadata is None:
        return []
    return [e.text.strip() for e in metadata.findall(f"dc:{elem_name}", NS) if e.text]


def _check_title(result: "VerificationResult", title: Optional[str]) -> None:
    """Flag a missing title or one that looks like a raw file name."""
    if not title:
        result.add_error("Titre vide")
        return
    if '_' in title:
        result.add_error("Titre sale (Underscores)")
    if title.lower().endswith('.epub'):
        result.add_error("Titre sale (Extension .epub)")
    if title.isdigit():
        result.add_error("Titre sale (Numérique)")


def _check_authors(result: "VerificationResult", authors: List[str]) -> None:
    """Flag missing, placeholder, comma-formatted or Calibre-tainted authors."""
    if not authors:
        result.add_error("Auteur vide ou 'Unknown'")
        return
    for author in authors:
        lowered = author.lower()
        if not author or lowered == 'unknown':
            result.add_error(f"Auteur invalide ('{author}')")
        if ',' in author:
            result.add_error(f"Auteur format invalide (Contient virgule): {author}")
        if 'calibre' in lowered:
            result.add_error(f"Auteur sale (Contient 'Calibre'): {author}")


def _check_date(result: "VerificationResult", date: Optional[str]) -> None:
    """Flag a missing date, one without a 4-digit year, or a year too far ahead."""
    import datetime  # local import: the module header does not import datetime

    if not date:
        result.add_error("Date vide")
        return
    match = _YEAR_RE.search(date)
    if not match:
        result.add_error(f"Date invalide (pas 4 chiffres): {date}")
        return
    year = int(match.group(1))
    # One year of slack so that announced, not-yet-released books pass.
    if year > datetime.date.today().year + 1:
        result.add_error(f"Année future: {year}")


def _check_publisher(result: "VerificationResult", publisher: Optional[str]) -> None:
    """Flag a missing publisher, a URL used as publisher, or a generic placeholder."""
    if not publisher:
        result.add_error("Éditeur manquant")
        return
    if _URL_RE.search(publisher):
        result.add_error("Éditeur sale (URL détectée)")
    if publisher.lower() in _GENERIC_PUBLISHERS:
        result.add_error(f"Éditeur générique ({publisher})")


def _is_valid_identifier(text: str, scheme: str) -> bool:
    """Tell whether one ``dc:identifier`` counts as a real ISBN/URN identifier.

    Identifiers tagged ``opf:scheme="calibre"`` never count, whatever their
    text, so that a book carrying only Calibre ids is reported.
    """
    if not text or scheme == 'calibre':
        return False
    if scheme == 'isbn':
        return True
    lowered = text.lower()
    if lowered.startswith('urn:'):
        # Any URN is accepted; one that mentions "uuid" must be a genuine
        # urn:uuid (or urn:isbn) rather than an arbitrary urn embedding it.
        return 'uuid' not in lowered or lowered.startswith(('urn:uuid', 'urn:isbn'))
    compact = text.replace('-', '').replace(' ', '')
    return _ISBN_RE.match(compact) is not None


def _check_identifiers(result: "VerificationResult", metadata: Optional[ET.Element]) -> None:
    """Require at least one valid (non-Calibre) ISBN or URN identifier."""
    identifiers = metadata.findall("dc:identifier", NS) if metadata is not None else []
    if not identifiers:
        result.add_error("Identifiant vide")
        return
    scheme_attr = f"{_OPF_TAG_PREFIX}scheme"
    for ident in identifiers:
        text = (ident.text or "").strip()
        scheme = ident.get(scheme_attr, "").lower()
        if _is_valid_identifier(text, scheme):
            return
    result.add_error("Aucun identifiant valide (Uniquement Calibre/Inconnu)")


def _check_language(result: "VerificationResult", lang: Optional[str]) -> None:
    """Flag a missing language or one that is not shaped like a language tag."""
    if not lang:
        result.add_error("Langue vide")
    elif not _LANG_RE.match(lang):
        result.add_error(f"Code langue invalide: {lang}")


def _check_cover(
    result: "VerificationResult",
    root: ET.Element,
    metadata: Optional[ET.Element],
) -> None:
    """Require a cover declared the EPUB 2 way (meta + manifest item) or the EPUB 3 way."""
    # Explicit None checks: an Element without children is falsy, so truth
    # testing would mistake an empty <manifest/> for a missing one.
    manifest = root.find(f"{_OPF_TAG_PREFIX}manifest")
    if manifest is None:
        manifest = root.find("manifest")
    items = _opf_children(manifest, "item")

    # EPUB 2: <meta name="cover" content="ITEM_ID"/>. In a standard OPF the
    # element inherits the package's default namespace, so both the
    # namespaced and the bare tag are searched.
    cover_id = None
    for meta in _opf_children(metadata, "meta"):
        if meta.get('name') == 'cover':
            cover_id = meta.get('content')
            break

    has_cover = any(
        (cover_id and item.get('id') == cover_id)
        # EPUB 3: manifest item carrying the "cover-image" property.
        or 'cover-image' in item.get('properties', '').split()
        for item in items
    )
    if not has_cover:
        result.add_error("Couverture absente (Manifest ou Meta manquant)")


def validate_epub(filepath: str, root_dir: str) -> VerificationResult:
    """Audit the metadata of one EPUB file.

    Opens the archive, locates and parses its OPF package document, then
    checks title, authors, date, publisher, identifiers, language and cover,
    in that order.

    Args:
        filepath: Path of the EPUB file to inspect.
        root_dir: Scan root; used only to compute the reported relative path.

    Returns:
        A VerificationResult whose ``errors`` list holds one message per
        problem found (empty when the book is clean). Structural problems
        (not a zip, OPF missing or unparsable) stop the audit early with a
        single error. Any unexpected exception is reported as an
        "Erreur technique" entry instead of being raised.
    """
    rel_path = os.path.relpath(filepath, root_dir)
    result = VerificationResult(os.path.basename(filepath), rel_path)

    try:
        if not zipfile.is_zipfile(filepath):
            result.add_error("Fichier corrompu ou non valide (pas un zip)")
            return result

        with zipfile.ZipFile(filepath, 'r') as zf:
            opf_path = get_opf_path(zf)
            if not opf_path:
                result.add_error("Fichier OPF introuvable (structure invalide)")
                return result

            if opf_path not in zf.namelist():
                result.add_error(f"Fichier OPF déclaré mais absent: {opf_path}")
                return result

            try:
                with zf.open(opf_path) as opf_file:
                    root = ET.parse(opf_file).getroot()
            except ET.ParseError:
                result.add_error("Erreur de parsing XML du fichier OPF")
                return result

        # The tree is fully in memory from here on; the archive is closed.
        metadata = root.find(f".//{_OPF_TAG_PREFIX}metadata")
        if metadata is None:
            # Tolerate packages that omit the OPF namespace.
            metadata = root.find("metadata")

        _check_title(result, _dc_text(metadata, 'title'))
        _check_authors(result, _dc_all_text(metadata, 'creator'))
        _check_date(result, _dc_text(metadata, 'date'))
        _check_publisher(result, _dc_text(metadata, 'publisher'))
        _check_identifiers(result, metadata)
        _check_language(result, _dc_text(metadata, 'language'))
        _check_cover(result, root, metadata)

    except Exception as e:
        # Per-file boundary: one broken book must not abort the whole scan.
        result.add_error(f"Erreur technique: {str(e)}")

    return result

def _error_icon(err: str) -> str:
    """Pick the marker shown in front of an error line.

    Only a missing publisher is downgraded to a warning; every other
    message is shown as an error.
    """
    lowered = err.lower()
    if ("manquant" in lowered or "absent" in lowered) and "éditeur" in lowered:
        return "⚠️"
    return "❌"


def main():
    """Command-line entry point: audit every ``.epub`` under a directory.

    Reads the directory from the command line, walks it recursively, prints
    an ASCII tree of the errors found for each faulty book and finishes with
    a summary line. Exits with status 1 when the argument is not an existing
    directory.
    """
    parser = argparse.ArgumentParser(description='Audit EPUB Metadata')
    parser.add_argument('directory', help='Root directory to scan')
    args = parser.parse_args()

    root_dir = os.path.abspath(args.directory)

    # isdir rather than exists: a regular file would otherwise be accepted
    # and silently produce an empty report.
    if not os.path.isdir(root_dir):
        print(f"Erreur: Le dossier '{root_dir}' n'existe pas.")
        sys.exit(1)

    print("📚 Audit Ebook Exhaustif")

    perfect_count = 0
    error_count = 0

    for current_root, dirs, files in os.walk(root_dir):
        # Sorting dirs in place fixes the traversal order (os.walk honours
        # in-place changes when walking top-down); together with the sorted
        # file list this makes the report reproducible across runs.
        dirs.sort()
        files.sort()
        for filename in files:
            if not filename.lower().endswith('.epub'):
                continue
            res = validate_epub(os.path.join(current_root, filename), root_dir)

            if res.is_valid():
                perfect_count += 1
                continue

            error_count += 1
            print(f"📖 {res.filename}")
            print(f"   🔗 {res.rel_path}")
            for err in res.errors:
                print(f"   └ {_error_icon(err)} {err}")

    print("...")
    print(f"✅ [{perfect_count}] Livres parfaits / [{error_count}] Livres avec erreurs")

if __name__ == '__main__':
    main()