From c5f358e07d10b446cbfdaca663ba990db9a00d5f Mon Sep 17 00:00:00 2001 From: Maxime Killinger Date: Wed, 7 Jan 2026 23:31:59 +0000 Subject: [PATCH] feat: init epub audit tool (distroless/python) - Implemented 'audit_epubs.py' using standard libraries (zipfile, xml.etree) for high performance. - Created optimized Dockerfile based on 'gcr.io/distroless/python3-debian12' for minimal security attack surface. - Added strict metadata validation rules. - Included CI/CD workflow for Gitea Actions. - Added comprehensive README.md. --- .gitea/workflows/package.yaml | 35 ++++ Dockerfile | 21 +++ README.md | 47 +++++ audit_epubs.py | 316 ++++++++++++++++++++++++++++++++++ 4 files changed, 419 insertions(+) create mode 100644 .gitea/workflows/package.yaml create mode 100644 Dockerfile create mode 100644 README.md create mode 100644 audit_epubs.py diff --git a/.gitea/workflows/package.yaml b/.gitea/workflows/package.yaml new file mode 100644 index 0000000..d8fd63c --- /dev/null +++ b/.gitea/workflows/package.yaml @@ -0,0 +1,35 @@ +name: 🚀 Docker Build and Push + +on: [push] + +jobs: + build-and-push: + runs-on: ubuntu-latest + steps: + - name: 📥 Checkout code + uses: https://github.com/actions/checkout@v4 + + - name: 🛠️ Set up Docker Buildx + uses: https://github.com/docker/setup-buildx-action@v3 + + - name: 🔐 Login to Gitea Registry + uses: https://github.com/docker/login-action@v3 + with: + registry: gitea.killinger.fr + username: maxime.killinger + password: ${{ secrets.DOCKER_TOKEN }} + + - name: 📦 Build and push Docker image + uses: https://github.com/docker/build-push-action@v5 + with: + context: . 
+ push: true + tags: | + gitea.killinger.fr/maxime.killinger/epub-audit:${{ github.ref_name == 'main' && 'latest' || github.ref_name }} + + - name: 🔔 Trigger Watchtower + if: github.ref == 'refs/heads/main' + env: + TOKEN: ${{ secrets.WATCHTOWER_TOKEN }} + run: | + curl -X GET -H "Authorization: Bearer $TOKEN" http://192.168.1.118:3026/v1/update diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..28b9f53 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,21 @@ +# Build Stage +FROM python:3.11-slim-bookworm AS builder +WORKDIR /app +COPY audit_epubs.py . + +# Final Stage: Distroless +FROM gcr.io/distroless/python3-debian12 + +# OCI Labels +LABEL org.opencontainers.image.title="EPUB Audit Tool" \ + org.opencontainers.image.description="Outil d'audit de métadonnées de livres numériques EPUB (Distroless/Python)" \ + org.opencontainers.image.authors="Maxime Killinger" \ + org.opencontainers.image.source="https://gitea.killinger.fr/maxime.killinger/epub-audit" \ + org.opencontainers.image.licenses="MIT" + +WORKDIR /app +COPY --from=builder /app/audit_epubs.py . + +# Distroless python entrypoint is already python3 +# We just provide the script and default argument +CMD ["audit_epubs.py", "/books"] diff --git a/README.md b/README.md new file mode 100644 index 0000000..5d624d4 --- /dev/null +++ b/README.md @@ -0,0 +1,47 @@ +# Outil d'Audit de Métadonnées EPUB + +Cet outil analyse récursivement un répertoire de livres numériques (`.epub`) pour détecter les métadonnées manquantes ou mal formatées, selon des critères stricts. + +## Fonctionnalités + +* **Léger & Rapide** : Utilise uniquement la bibliothèque standard Python (`zipfile`, `xml.etree`). Aucune dépendance externe. +* **Conteneurisé** : Image Docker Distroless (Debian 12) optimisée (Multi-stage build). +* **Règles de Validation Strictes** : + * **Titre** : Pas d'underscores, pas d'extensions, pas purement numérique. + * **Auteur** : Pas de virgules (Format "Nom, Prénom" interdit), pas de "Calibre", pas vide. 
#!/usr/bin/env python3
"""Audit EPUB metadata under a directory tree using only the standard library.

Every ``.epub`` file found is opened as a zip archive, its OPF package document
is located through ``META-INF/container.xml`` and its Dublin Core metadata is
checked against these rules:

* title: present, no underscores, no ``.epub`` suffix, not purely numeric;
* author: present, not "unknown", no commas, no mention of Calibre;
* date: present, containing a 4-digit year no later than next year;
* publisher: present, not a URL, not a generic placeholder;
* identifier: at least one ISBN or URN (Calibre-only identifiers are rejected);
* language: a 2-3 letter code with optional subtags;
* cover: declared through the ``cover`` meta entry or an EPUB 3
  ``cover-image`` manifest item.

Usage::

    python3 audit_epubs.py /path/to/books
"""
import argparse
import datetime
import os
import re
import sys
import xml.etree.ElementTree as ET
import zipfile
from typing import Dict, List, Optional

# --- Configuration & Constants ---
OPF_NS = 'http://www.idpf.org/2007/opf'
DC_NS = 'http://purl.org/dc/elements/1.1/'
CONTAINER_NS = 'urn:oasis:names:tc:opendocument:xmlns:container'
NS = {'opf': OPF_NS, 'dc': DC_NS}
# Registering prefixes only affects serialisation; lookups pass NS explicitly.
for prefix, uri in NS.items():
    ET.register_namespace(prefix, uri)

_YEAR_RE = re.compile(r'(\d{4})')
_URL_RE = re.compile(r'https?://|www\.')
# The ISBN-10 check character may be a digit or 'X' (either case).
_ISBN10_RE = re.compile(r'^\d{9}[\dXx]$')
_ISBN13_RE = re.compile(r'^\d{13}$')
# Primary language subtag followed by any number of alphanumeric subtags,
# e.g. "fr", "en-GB", "es-419", "zh-Hans-CN".
_LANG_RE = re.compile(r'^[a-zA-Z]{2,3}(-[a-zA-Z0-9]{1,8})*$')
_GENERIC_PUBLISHERS = frozenset({'unknown', 'smashwords'})
_COVER_ERROR = "Couverture absente (Manifest ou Meta manquant)"


class VerificationResult:
    """Audit outcome for one EPUB file.

    Attributes:
        filename: Base name of the audited file.
        rel_path: Path of the file relative to the scanned root directory.
        errors: Human-readable rule violations, in the order they were found.
    """

    def __init__(self, filename: str, rel_path: str):
        self.filename = filename
        self.rel_path = rel_path
        self.errors: List[str] = []

    def add_error(self, error: str):
        """Record one rule violation."""
        self.errors.append(error)

    def is_valid(self) -> bool:
        """Return True when no violation has been recorded."""
        return not self.errors


def get_opf_path(epub_zip: zipfile.ZipFile) -> Optional[str]:
    """Return the OPF path declared in ``META-INF/container.xml``.

    Args:
        epub_zip: An opened EPUB archive.

    Returns:
        The ``full-path`` attribute of the first ``rootfile`` element, or
        ``None`` when the container file is missing, unreadable, malformed or
        declares no rootfile.
    """
    try:
        with epub_zip.open('META-INF/container.xml') as container_file:
            root = ET.parse(container_file).getroot()
    except (KeyError, ET.ParseError, zipfile.BadZipFile, OSError, RuntimeError):
        # KeyError: member absent; RuntimeError: encrypted/unsupported member.
        return None
    rootfile = root.find(f".//{{{CONTAINER_NS}}}rootfile")
    if rootfile is None:
        return None
    return rootfile.get('full-path')


def _opf_children(parent: ET.Element, tag: str) -> List[ET.Element]:
    """Return direct children named *tag*, in the OPF namespace or in none."""
    return parent.findall(f"{{{OPF_NS}}}{tag}") + parent.findall(tag)


def _find_metadata(root: ET.Element) -> Optional[ET.Element]:
    """Locate the ``metadata`` element, tolerating a missing namespace."""
    metadata = root.find(f".//{{{OPF_NS}}}metadata")
    if metadata is None:
        metadata = root.find("metadata")
    return metadata


def _dc_first_text(metadata: Optional[ET.Element], name: str) -> Optional[str]:
    """Return the stripped text of the first ``dc:<name>`` child, if any."""
    if metadata is None:
        return None
    elem = metadata.find(f"dc:{name}", NS)
    if elem is not None and elem.text:
        return elem.text.strip()
    return None


def _dc_all_texts(metadata: Optional[ET.Element], name: str) -> List[str]:
    """Return the stripped text of every ``dc:<name>`` child that has text."""
    if metadata is None:
        return []
    return [e.text.strip() for e in metadata.findall(f"dc:{name}", NS) if e.text]


def _check_title(title: Optional[str]) -> List[str]:
    """Rule 1: the title must be present and look hand-written."""
    if not title:
        return ["Titre vide"]
    errors = []
    if '_' in title:
        errors.append("Titre sale (Underscores)")
    if title.lower().endswith('.epub'):
        errors.append("Titre sale (Extension .epub)")
    if title.isdigit():
        errors.append("Titre sale (Numérique)")
    return errors


def _check_authors(authors: List[str]) -> List[str]:
    """Rule 2: every author must be a clean "First Last" style name."""
    if not authors:
        return ["Auteur vide ou 'Unknown'"]
    errors = []
    for author in authors:
        if not author or author.lower() == 'unknown':
            errors.append(f"Auteur invalide ('{author}')")
        if ',' in author:
            errors.append(f"Auteur format invalide (Contient virgule): {author}")
        if 'calibre' in author.lower():
            errors.append(f"Auteur sale (Contient 'Calibre'): {author}")
    return errors


def _check_date(date: Optional[str]) -> List[str]:
    """Rule 3: the date must hold a 4-digit year that is not in the future."""
    if not date:
        return ["Date vide"]
    match = _YEAR_RE.search(date)
    if not match:
        return [f"Date invalide (pas 4 chiffres): {date}"]
    year = int(match.group(1))
    # Next year is tolerated so that announced titles pass the audit.
    if year > datetime.date.today().year + 1:
        return [f"Année future: {year}"]
    return []


def _check_publisher(publisher: Optional[str]) -> List[str]:
    """Rule 4: the publisher must be present, not a URL and not a placeholder."""
    if not publisher:
        return ["Éditeur manquant"]
    errors = []
    if _URL_RE.search(publisher):
        errors.append("Éditeur sale (URL détectée)")
    if publisher.lower() in _GENERIC_PUBLISHERS:
        errors.append(f"Éditeur générique ({publisher})")
    return errors


def _is_valid_identifier(text: str, scheme: str) -> bool:
    """Tell whether one non-empty identifier counts as an ISBN or a URN."""
    if scheme == 'calibre':
        # Calibre's internal id never counts, whatever its text looks like.
        return False
    if scheme == 'isbn':
        return True
    lowered = text.lower()
    if lowered.startswith('urn:'):
        # Covers urn:isbn:, urn:uuid: and any other URN namespace.
        return True
    compact = text.replace('-', '').replace(' ', '')
    return bool(_ISBN10_RE.match(compact) or _ISBN13_RE.match(compact))


def _check_identifiers(metadata: Optional[ET.Element]) -> List[str]:
    """Rule 5: at least one identifier must be an ISBN or a URN."""
    identifiers = [] if metadata is None else metadata.findall("dc:identifier", NS)
    if not identifiers:
        return ["Identifiant vide"]
    for ident in identifiers:
        text = (ident.text or "").strip()
        # opf:scheme is namespaced in OPF 2.0; accept a bare attribute as well.
        scheme = (ident.get(f"{{{OPF_NS}}}scheme") or ident.get("scheme") or "").lower()
        if text and _is_valid_identifier(text, scheme):
            return []
    return ["Aucun identifiant valide (Uniquement Calibre/Inconnu)"]


def _check_language(lang: Optional[str]) -> List[str]:
    """Rule 6: the language must be a well-formed language tag."""
    if not lang:
        return ["Langue vide"]
    if not _LANG_RE.match(lang):
        return [f"Code langue invalide: {lang}"]
    return []


def _check_cover(root: ET.Element, metadata: Optional[ET.Element]) -> List[str]:
    """Rule 7: a cover must be declared and resolvable in the manifest."""
    manifest = root.find(f"{{{OPF_NS}}}manifest")
    if manifest is None:
        manifest = root.find("manifest")
    if manifest is None:
        return [_COVER_ERROR]
    items = _opf_children(manifest, 'item')

    # EPUB 2 convention: <meta name="cover" content="<manifest item id>"/>.
    # The meta element normally lives in the OPF default namespace.
    cover_id = None
    if metadata is not None:
        for meta in _opf_children(metadata, 'meta'):
            if meta.get('name') == 'cover':
                cover_id = meta.get('content')
                break
    if cover_id and any(item.get('id') == cover_id for item in items):
        return []

    # EPUB 3 convention: <item properties="cover-image" .../>.
    if any('cover-image' in item.get('properties', '').split() for item in items):
        return []
    return [_COVER_ERROR]


def _audit_package(root: ET.Element) -> List[str]:
    """Run every metadata rule against a parsed OPF document."""
    metadata = _find_metadata(root)
    errors: List[str] = []
    errors += _check_title(_dc_first_text(metadata, 'title'))
    errors += _check_authors(_dc_all_texts(metadata, 'creator'))
    errors += _check_date(_dc_first_text(metadata, 'date'))
    errors += _check_publisher(_dc_first_text(metadata, 'publisher'))
    errors += _check_identifiers(metadata)
    errors += _check_language(_dc_first_text(metadata, 'language'))
    errors += _check_cover(root, metadata)
    return errors


def validate_epub(filepath: str, root_dir: str) -> VerificationResult:
    """Audit one EPUB file.

    Args:
        filepath: Path of the ``.epub`` file.
        root_dir: Scanned root directory, used to compute the relative path.

    Returns:
        A VerificationResult listing every violated rule. Structural problems
        (not a zip, OPF missing or unparsable) short-circuit the metadata rules.
        Unexpected failures are reported as an error entry instead of raising.
    """
    result = VerificationResult(
        os.path.basename(filepath), os.path.relpath(filepath, root_dir)
    )
    try:
        if not zipfile.is_zipfile(filepath):
            result.add_error("Fichier corrompu ou non valide (pas un zip)")
            return result

        with zipfile.ZipFile(filepath, 'r') as zf:
            opf_path = get_opf_path(zf)
            if not opf_path:
                result.add_error("Fichier OPF introuvable (structure invalide)")
                return result
            if opf_path not in zf.namelist():
                result.add_error(f"Fichier OPF déclaré mais absent: {opf_path}")
                return result
            with zf.open(opf_path) as opf_file:
                try:
                    root = ET.parse(opf_file).getroot()
                except ET.ParseError:
                    result.add_error("Erreur de parsing XML du fichier OPF")
                    return result

        for error in _audit_package(root):
            result.add_error(error)
    except Exception as e:
        # Deliberate catch-all: one broken file must not abort the whole scan.
        result.add_error(f"Erreur technique: {str(e)}")

    return result


def _icon_for(error: str) -> str:
    """Pick the report icon: a warning for a missing publisher, a cross otherwise."""
    lowered = error.lower()
    if ("manquant" in lowered or "absent" in lowered) and "éditeur" in lowered:
        return "⚠️"
    return "❌"


def main():
    """Scan the directory given on the command line and print the audit report."""
    parser = argparse.ArgumentParser(description='Audit EPUB Metadata')
    parser.add_argument('directory', help='Root directory to scan')
    args = parser.parse_args()

    root_dir = os.path.abspath(args.directory)

    if not os.path.isdir(root_dir):
        print(
            f"Erreur: Le dossier '{root_dir}' n'existe pas ou n'est pas un dossier.",
            file=sys.stderr,
        )
        sys.exit(1)

    print("📚 Audit Ebook Exhaustif")

    perfect_count = 0
    error_count = 0

    for current_root, dirs, files in os.walk(root_dir):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirs.sort()
        for filename in sorted(files):
            if not filename.lower().endswith('.epub'):
                continue
            res = validate_epub(os.path.join(current_root, filename), root_dir)
            if res.is_valid():
                perfect_count += 1
                continue
            error_count += 1
            print(f"📖 {res.filename}")
            print(f" 🔗 {res.rel_path}")
            for err in res.errors:
                print(f" └ {_icon_for(err)} {err}")

    print("...")
    print(f"✅ [{perfect_count}] Livres parfaits / [{error_count}] Livres avec erreurs")


if __name__ == '__main__':
    main()