feat: init epub audit tool (distroless/python)

- Implemented 'audit_epubs.py' using standard libraries (zipfile, xml.etree) for high performance. - Created optimized Dockerfile based on 'gcr.io/distroless/python3-debian12' for minimal security attack surface. - Added strict metadata validation rules. - Included CI/CD workflow for Gitea Actions. - Added comprehensive README.md.
2026-01-07 23:31:59 +00:00
commit c5f358e07d
4 changed files with 419 additions and 0 deletions
@@ -0,0 +1,35 @@
+# Builds the container image on every push and publishes it to the Gitea registry.
+# On the main branch it additionally asks Watchtower to pull the fresh image.
+name: 🚀 Docker Build and Push
+
+on: [push]
+
+jobs:
+  build-and-push:
+    runs-on: ubuntu-latest
+    steps:
+      - name: 📥 Checkout code
+        uses: https://github.com/actions/checkout@v4
+
+      - name: 🛠️ Set up Docker Buildx
+        uses: https://github.com/docker/setup-buildx-action@v3
+
+      - name: 🔐 Login to Gitea Registry
+        uses: https://github.com/docker/login-action@v3
+        with:
+          registry: gitea.killinger.fr
+          username: maxime.killinger
+          password: ${{ secrets.DOCKER_TOKEN }}
+
+      - name: 📦 Build and push Docker image
+        uses: https://github.com/docker/build-push-action@v5
+        with:
+          context: .
+          push: true
+          # Pushes to main are tagged "latest"; any other ref is tagged with its own name.
+          # NOTE(review): a ref name containing "/" would not be a valid image tag - confirm branch naming.
+          tags: |
+            gitea.killinger.fr/maxime.killinger/epub-audit:${{ github.ref_name == 'main' && 'latest' || github.ref_name }}
+
+      - name: 🔔 Trigger Watchtower
+        if: github.ref == 'refs/heads/main'
+        env:
+          TOKEN: ${{ secrets.WATCHTOWER_TOKEN }}
+        run: |
+          # --fail turns an HTTP error status (bad token, endpoint down) into a failed step
+          # instead of a silently green one; --silent/--show-error keep the log free of the
+          # progress meter while still printing the reason for a failure.
+          curl --fail --silent --show-error -X GET -H "Authorization: Bearer $TOKEN" http://192.168.1.118:3026/v1/update
@@ -0,0 +1,21 @@
+# Build Stage
+# This stage only places the script under /app; nothing is installed or compiled here.
+FROM python:3.11-slim-bookworm AS builder
+WORKDIR /app
+COPY audit_epubs.py .
+
+# Final Stage: Distroless
+FROM gcr.io/distroless/python3-debian12
+
+# OCI Labels
+LABEL org.opencontainers.image.title="EPUB Audit Tool" \
+      org.opencontainers.image.description="Outil d'audit de métadonnées de livres numériques EPUB (Distroless/Python)" \
+      org.opencontainers.image.authors="Maxime Killinger" \
+      org.opencontainers.image.source="https://gitea.killinger.fr/maxime.killinger/epub-audit" \
+      org.opencontainers.image.licenses="MIT"
+
+WORKDIR /app
+# The script is the only file carried over from the build stage.
+COPY --from=builder /app/audit_epubs.py .
+
+# Distroless python entrypoint is already python3
+# We just provide the script and default argument
+# ("/books" is the directory to scan; mount the library there or override CMD).
+CMD ["audit_epubs.py", "/books"]
@@ -0,0 +1,47 @@
+# Outil d'Audit de Métadonnées EPUB
+
+Cet outil analyse récursivement un répertoire de livres numériques (`.epub`) pour détecter les métadonnées manquantes ou mal formatées, selon des critères stricts.
+
+## Fonctionnalités
+
+*   **Léger & Rapide** : Utilise uniquement la librairie standard Python (`zipfile`, `xml.etree`). Aucune dépendance externe.
+*   **Conteneurisé** : Image Docker distroless (Debian 12) optimisée (Multi-stage build).
+*   **Règles de Validation Strictes** :
+    *   **Titre** : Pas d'underscores, pas d'extensions, pas purement numérique.
+    *   **Auteur** : Pas de virgules (Format "Nom, Prénom" interdit), pas de "Calibre", pas vide.
+    *   **Identifiants** : Doit avoir un ISBN ou un URN valide (autre que Calibre UUID seul).
+    *   **Éditeur** : Pas d'URL, pas générique ("Unknown").
+    *   **Couverture** : Vérification de la présence dans le manifest et les métadonnées.
+
+## Utilisation
+
+### Avec Docker (Recommandé)
+
+1.  **Construire l'image** :
+    ```bash
+    docker build -t epub-audit .
+    ```
+
+2.  **Lancer l'audit** (en montant votre dossier de livres dans `/books`) :
+    ```bash
+    docker run --rm -v /mnt/user/media/books/ebooks:/books epub-audit
+    ```
+
+### En local (Python 3)
+
+```bash
+python3 audit_epubs.py /chemin/vers/vos/livres
+```
+
+## Format de Sortie
+
+Le script affiche une arborescence ASCII des erreurs :
+
+```text
+📚 Audit Ebook Exhaustif
+📖 [Nom du fichier]
+   🔗 [Chemin relatif]
+   └ ❌ Erreur détectée...
+...
+✅ [X] Livres parfaits / [Y] Livres avec erreurs
+```
@@ -0,0 +1,316 @@
+#!/usr/bin/env python3
+import os
+import sys
+import zipfile
+import xml.etree.ElementTree as ET
+import re
+import argparse
+from typing import List, Dict, Optional
+
+# --- Configuration & Constants ---
+# Prefix -> namespace URI map handed to find()/findall() for "opf:" and "dc:" lookups.
+NS = {'opf': 'http://www.idpf.org/2007/opf', 'dc': 'http://purl.org/dc/elements/1.1/'}
+# register_namespace() only controls the prefixes ElementTree emits when serializing XML;
+# it has no effect on searches, which rely solely on the NS map above.
+for prefix, uri in NS.items():
+    ET.register_namespace(prefix, uri)
+
+class VerificationResult:
+    """Audit outcome for a single EPUB file.
+
+    Holds the file name, its path relative to the scan root, and the list of
+    error messages collected while validating it.
+    """
+
+    def __init__(self, filename: str, rel_path: str):
+        # Messages are accumulated in the order the checks report them.
+        self.errors: List[str] = []
+        self.rel_path = rel_path
+        self.filename = filename
+
+    def add_error(self, error: str):
+        """Record one more problem found in the file."""
+        self.errors += [error]
+
+    def is_valid(self) -> bool:
+        """Return True when no error has been recorded."""
+        return not self.errors
+
+def get_opf_path(epub_zip: zipfile.ZipFile) -> Optional[str]:
+    """Return the OPF package path declared in META-INF/container.xml.
+
+    The value is the ``full-path`` attribute of the first ``rootfile`` element
+    found in the container document. ``None`` is returned when the container
+    file is missing or unreadable, or when it declares no rootfile.
+    """
+    rootfile_tag = "{urn:oasis:names:tc:opendocument:xmlns:container}rootfile"
+    try:
+        with epub_zip.open('META-INF/container.xml') as handle:
+            first_rootfile = ET.parse(handle).getroot().find(f".//{rootfile_tag}")
+            if first_rootfile is None:
+                return None
+            return first_rootfile.get('full-path')
+    except Exception:
+        # Best effort: any failure here is reported by the caller as "no OPF".
+        return None
+
+def validate_epub(filepath: str, root_dir: str) -> VerificationResult:
+    """Audit the metadata of one EPUB file and collect every rule violation.
+
+    Checks, in order: title, authors, date, publisher, identifiers, language
+    and cover declaration, all read from the OPF package document.
+
+    Args:
+        filepath: Path of the ``.epub`` file to inspect.
+        root_dir: Scan root; only used to compute the relative path stored in
+            the result.
+
+    Returns:
+        A VerificationResult whose ``errors`` list is empty when the book
+        passes every check. Structural problems (not a zip, OPF missing or
+        unparsable) stop the audit early with a single error. Unexpected
+        exceptions are recorded as an "Erreur technique" entry instead of
+        propagating.
+    """
+    import datetime  # local import: the module header does not provide it
+
+    rel_path = os.path.relpath(filepath, root_dir)
+    filename = os.path.basename(filepath)
+    result = VerificationResult(filename, rel_path)
+    opf_ns = NS['opf']
+
+    def opf_children(parent, tag: str) -> list:
+        # OPF elements normally live in the OPF namespace, but some files omit
+        # it; accept both spellings so such books are still inspected.
+        if parent is None:
+            return []
+        return parent.findall(f"{{{opf_ns}}}{tag}") + parent.findall(tag)
+
+    try:
+        if not zipfile.is_zipfile(filepath):
+            result.add_error("Fichier corrompu ou non valide (pas un zip)")
+            return result
+
+        with zipfile.ZipFile(filepath, 'r') as zf:
+            opf_path = get_opf_path(zf)
+            if not opf_path:
+                result.add_error("Fichier OPF introuvable (structure invalide)")
+                return result
+
+            # The container may point at a file that is not in the archive.
+            if opf_path not in zf.namelist():
+                result.add_error(f"Fichier OPF déclaré mais absent: {opf_path}")
+                return result
+
+            with zf.open(opf_path) as opf_file:
+                try:
+                    root = ET.parse(opf_file).getroot()
+                except ET.ParseError:
+                    result.add_error("Erreur de parsing XML du fichier OPF")
+                    return result
+
+        # The package document is fully parsed; the archive is no longer needed.
+        # Compare against None explicitly: an Element without children is
+        # falsy, so truth-testing would mistake an empty element for a missing one.
+        metadata = root.find(f".//{{{opf_ns}}}metadata")
+        if metadata is None:
+            metadata = root.find("metadata")
+
+        def get_text(elem_name: str) -> Optional[str]:
+            # Stripped text of the first dc:<elem_name>, or None when absent/empty.
+            if metadata is None:
+                return None
+            elem = metadata.find(f"dc:{elem_name}", NS)
+            if elem is not None and elem.text:
+                return elem.text.strip()
+            return None
+
+        def get_all_text(elem_name: str) -> List[str]:
+            # Stripped text of every dc:<elem_name> that has text content.
+            if metadata is None:
+                return []
+            return [e.text.strip() for e in metadata.findall(f"dc:{elem_name}", NS) if e.text]
+
+        # 1. Title
+        title = get_text('title')
+        if not title:
+            result.add_error("Titre vide")
+        else:
+            if '_' in title:
+                result.add_error("Titre sale (Underscores)")
+            if title.lower().endswith('.epub'):
+                result.add_error("Titre sale (Extension .epub)")
+            if title.isdigit():
+                result.add_error("Titre sale (Numérique)")
+
+        # 2. Author
+        authors = get_all_text('creator')
+        if not authors:
+            result.add_error("Auteur vide ou 'Unknown'")
+        else:
+            for author in authors:
+                if not author or author.lower() == 'unknown':
+                    result.add_error(f"Auteur invalide ('{author}')")
+                if ',' in author:
+                    result.add_error(f"Auteur format invalide (Contient virgule): {author}")
+                if 'calibre' in author.lower():
+                    result.add_error(f"Auteur sale (Contient 'Calibre'): {author}")
+
+        # 3. Year/Date
+        date = get_text('date')
+        if not date:
+            result.add_error("Date vide")
+        else:
+            match = re.search(r'(\d{4})', date)
+            if not match:
+                result.add_error(f"Date invalide (pas 4 chiffres): {date}")
+            else:
+                year = int(match.group(1))
+                # One year of slack so announced/upcoming releases are accepted.
+                if year > datetime.datetime.now().year + 1:
+                    result.add_error(f"Année future: {year}")
+
+        # 4. Publisher
+        publisher = get_text('publisher')
+        if not publisher:
+            result.add_error("Éditeur manquant")
+        else:
+            if re.search(r'https?://', publisher) or re.search(r'www\.', publisher):
+                result.add_error("Éditeur sale (URL détectée)")
+            if publisher.lower() in ['unknown', 'smashwords']:
+                result.add_error(f"Éditeur générique ({publisher})")
+
+        # 5. Identifier (ISBN/URN)
+        identifiers = metadata.findall("dc:identifier", NS) if metadata is not None else []
+        valid_id_found = False
+
+        for ident in identifiers:
+            text = ident.text.strip() if ident.text else ""
+            if not text:
+                continue
+            lowered = text.lower()
+            scheme = ident.get(f"{{{opf_ns}}}scheme", "").lower()
+
+            # Any non-UUID URN counts as a valid identifier, whatever its scheme.
+            if lowered.startswith('urn:') and 'uuid' not in lowered:
+                valid_id_found = True
+
+            if scheme == 'calibre':
+                # A Calibre-generated identifier alone must not satisfy the rule.
+                continue
+            if scheme == 'isbn':
+                valid_id_found = True
+                continue
+
+            # No usable scheme: fall back to recognising the value itself.
+            clean_id = text.replace('-', '').replace(' ', '')
+            looks_like_isbn = (
+                re.fullmatch(r'\d{9}[\dXx]', clean_id)  # ISBN-10, check digit may be X
+                or re.fullmatch(r'\d{13}', clean_id)    # ISBN-13
+            )
+            if looks_like_isbn or lowered.startswith(('urn:isbn', 'urn:uuid')):
+                valid_id_found = True
+
+        if not valid_id_found:
+            if not identifiers:
+                result.add_error("Identifiant vide")
+            else:
+                result.add_error("Aucun identifiant valide (Uniquement Calibre/Inconnu)")
+
+        # 6. Language
+        lang = get_text('language')
+        if not lang:
+            result.add_error("Langue vide")
+        elif not re.match(r'^[a-zA-Z]{2,3}(-[a-zA-Z]+)?$', lang):
+            result.add_error(f"Code langue invalide: {lang}")
+
+        # 7. Cover
+        # EPUB 2 convention: <meta name="cover" content="ITEM_ID"/> in the
+        # metadata, pointing at <item id="ITEM_ID" .../> in the manifest.
+        cover_id = None
+        for meta in opf_children(metadata, "meta"):
+            if meta.get('name') == 'cover':
+                cover_id = meta.get('content')
+                break
+
+        manifest = root.find(f"{{{opf_ns}}}manifest")
+        if manifest is None:
+            manifest = root.find("manifest")
+        items = opf_children(manifest, "item")
+
+        cover_found = bool(cover_id) and any(item.get('id') == cover_id for item in items)
+        if not cover_found:
+            # EPUB 3 convention: a manifest item flagged properties="cover-image".
+            cover_found = any(
+                'cover-image' in item.get('properties', '').split() for item in items
+            )
+
+        if not cover_found:
+            result.add_error("Couverture absente (Manifest ou Meta manquant)")
+
+    except Exception as e:
+        # Boundary of the per-file audit: one broken book must not stop the scan.
+        result.add_error(f"Erreur technique: {str(e)}")
+
+    return result
+
+def main():
+    """Scan a directory tree for EPUB files and print the audit report.
+
+    Reads the root directory from the command line, validates every file whose
+    name ends in ``.epub`` (case-insensitive), prints the errors of each
+    failing book, then prints the totals. Exits with status 1 when the given
+    path is not an existing directory.
+    """
+    parser = argparse.ArgumentParser(description='Audit EPUB Metadata')
+    parser.add_argument('directory', help='Root directory to scan')
+    args = parser.parse_args()
+
+    root_dir = os.path.abspath(args.directory)
+
+    # isdir rather than exists: a regular file would otherwise be "scanned"
+    # and silently reported as 0 books.
+    if not os.path.isdir(root_dir):
+        print(f"Erreur: Le dossier '{root_dir}' n'existe pas.")
+        sys.exit(1)
+
+    print("📚 Audit Ebook Exhaustif")
+
+    perfect_count = 0
+    error_count = 0
+
+    for current_root, dirs, files in os.walk(root_dir):
+        # Sorting dirs in place steers os.walk's descent order, so together
+        # with the sorted files the report is identical from run to run.
+        dirs.sort()
+        files.sort()
+        for filename in files:
+            if not filename.lower().endswith('.epub'):
+                continue
+
+            res = validate_epub(os.path.join(current_root, filename), root_dir)
+            if res.is_valid():
+                perfect_count += 1
+                continue
+
+            # Only books with at least one error are listed.
+            error_count += 1
+            print(f"📖 {res.filename}")
+            print(f"   🔗 {res.rel_path}")
+            for err in res.errors:
+                lowered = err.lower()
+                is_missing = "manquant" in lowered or "absent" in lowered
+                # A missing publisher is only a warning; everything else is an error.
+                icon = "⚠️" if is_missing and "éditeur" in lowered else "❌"
+                print(f"   └ {icon} {err}")
+
+    print("...")
+    print(f"✅ [{perfect_count}] Livres parfaits / [{error_count}] Livres avec erreurs")
+
+
+if __name__ == '__main__':
+    main()