# PDF Metadata Cleaner Source Code

#!/usr/bin/env python3
"""
PDF Metadata Cleaner
Made by Ogisha

Uses exiftool and qpdf to remove metadata from PDF files and linearize them.
Designed for educational purposes and authorized document privacy analysis.
"""

import os
import sys
import shutil
import argparse
import tempfile
import subprocess
from pathlib import Path

# ANSI escape sequences used to style terminal output.
# Text attributes.
RESET = "\x1b[0m"
BOLD = "\x1b[1m"
DIM = "\x1b[2m"
# Foreground colors (codes 91-97).
RED = "\x1b[91m"
GREEN = "\x1b[92m"
YELLOW = "\x1b[93m"
BLUE = "\x1b[94m"
MAGENTA = "\x1b[95m"
CYAN = "\x1b[96m"
WHITE = "\x1b[97m"


def print_banner():
    """Print the boxed program title in bold cyan, padded by blank lines."""
    # The empty first/last entries reproduce the leading and trailing
    # newlines of the banner text.
    banner_lines = (
        "",
        f"{CYAN}{BOLD}",
        "╔══════════════════════════════════════════════════════════════╗",
        "║                    PDF METADATA CLEANER                      ║",
        "║                        Made by Ogisha                        ║",
        "╚══════════════════════════════════════════════════════════════╝",
        f"{RESET}",
        "",
    )
    print("\n".join(banner_lines))


def print_info(label, value):
    """Print an ``[INFO]`` line showing *label* followed by *value*."""
    badge = f"{BOLD}{BLUE}[INFO]{RESET}"
    key = f"{WHITE}{label}:{RESET}"
    print(f"{badge} {key} {value}")


def print_success(message):
    """Print *message* prefixed with a bold green ``[OK]`` badge."""
    badge = f"{BOLD}{GREEN}[OK]{RESET}"
    print(f"{badge} {message}")


def print_warning(message):
    """Print *message* prefixed with a bold yellow ``[WARN]`` badge."""
    badge = f"{BOLD}{YELLOW}[WARN]{RESET}"
    print(f"{badge} {message}")


def print_error(message):
    """Print *message* prefixed with a bold red ``[ERROR]`` badge."""
    badge = f"{BOLD}{RED}[ERROR]{RESET}"
    print(f"{badge} {message}")


def print_section(title):
    """Print *title* centered between two 62-character ``=`` rules.

    The header is preceded by a blank line and rendered in bold magenta;
    styling is reset at the end of the closing rule.
    """
    width = 62
    rule = "=" * width
    print(f"\n{MAGENTA}{BOLD}{rule}")
    print(title.center(width))
    print(f"{rule}{RESET}")


def run_command(cmd):
    """Run an external command and capture its output as text.

    Args:
        cmd: Argument list (program name first). It is passed straight to
            ``subprocess.run`` without a shell.

    Returns:
        The ``subprocess.CompletedProcess``; ``stdout`` and ``stderr`` are
        ``str``. A non-zero exit status is not raised, callers inspect
        ``returncode`` themselves.

    Raises:
        FileNotFoundError: If the program cannot be found (propagated from
            ``subprocess.run``).
    """
    # The wrapped tools echo arbitrary document metadata. Bytes that are not
    # valid in the locale encoding must not abort the run with a
    # UnicodeDecodeError, so they are replaced instead of decoded strictly.
    return subprocess.run(cmd, capture_output=True, text=True, errors="replace")


def check_dependencies():
    """Check that exiftool and qpdf can be executed.

    Each tool is invoked with ``--version``. A missing executable or a
    non-zero exit status is reported, and install commands are printed for
    tools that are not found.

    Returns:
        True if every tool ran and exited with status 0, otherwise False.
    """
    print_section("DEPENDENCY CHECK")

    install_hints = {
        "exiftool": {
            "macos": "brew install exiftool",
            "linux": "sudo apt-get install -y libimage-exiftool-perl",
        },
        "qpdf": {
            "macos": "brew install qpdf",
            "linux": "sudo apt-get install -y qpdf",
        },
    }

    problems = 0

    for tool, hints in install_hints.items():
        try:
            proc = run_command([tool, "--version"])
        except FileNotFoundError:
            problems += 1
            print_error(f"{tool} is not installed.")
            print(f"  {BOLD}Install on macOS:{RESET}  {hints['macos']}")
            print(f"  {BOLD}Install on Linux:{RESET}  {hints['linux']}")
            continue

        if proc.returncode != 0:
            problems += 1
            print_error(f"{tool} is installed but returned an error.")
            continue

        # Show only the first line of whatever the tool printed.
        report = (proc.stdout or proc.stderr).strip().splitlines()
        version = report[0] if report else "installed"
        print_success(f"{tool} found ({version})")

    return problems == 0


def remove_metadata_exiftool(input_pdf, output_pdf):
    """Write a copy of *input_pdf* with its metadata deleted by exiftool.

    Args:
        input_pdf: Path of the PDF to read; it is not modified.
        output_pdf: Path exiftool writes the stripped copy to (``-o``).
            NOTE(review): callers pass a path that does not exist yet;
            exiftool is expected to refuse to overwrite an existing file.

    Returns:
        True if exiftool exited with status 0, otherwise False. Failures
        are printed, not raised.
    """
    print_section("STEP 1 - METADATA REMOVAL")
    print_info("Input PDF", input_pdf)
    print_info("Temporary output", output_pdf)

    # "-all=" deletes every tag exiftool can write. Do NOT follow it with
    # "-TagsFromFile @ -Author -Title ...": that form copies the named tags
    # back from the source file, preserving exactly the fields this step is
    # supposed to remove.
    cmd = [
        "exiftool",
        "-all=",
        "-o", output_pdf,
        input_pdf,
    ]

    try:
        result = run_command(cmd)
    except Exception as e:
        # Boundary handler: report any launch failure instead of crashing.
        print_error(f"Unexpected exiftool error: {e}")
        return False

    if result.returncode == 0:
        print_success("Metadata successfully removed with exiftool.")
        return True

    print_error("Metadata removal failed.")
    details = result.stderr.strip()
    if details:
        print(details)
    return False


def linearize_pdf_qpdf(input_pdf, output_pdf):
    """Rewrite *input_pdf* as a linearized PDF at *output_pdf* using qpdf.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path qpdf writes the rewritten file to.

    Returns:
        True if qpdf produced the output, either cleanly (exit status 0) or
        with warnings (exit status 3 and the output file exists). False on
        any other outcome. Failures are printed, not raised.
    """
    print_section("STEP 2 - PDF LINEARIZATION")
    print_info("Input PDF", input_pdf)
    print_info("Temporary output", output_pdf)

    cmd = [
        "qpdf",
        "--linearize",
        "--object-streams=disable",
        "--remove-unreferenced-resources=yes",
        "--compress-streams=y",
        "--coalesce-contents",
        input_pdf,
        output_pdf,
    ]

    try:
        result = run_command(cmd)
    except Exception as e:
        # Boundary handler: report any launch failure instead of crashing.
        print_error(f"Unexpected qpdf error: {e}")
        return False

    details = result.stderr.strip()

    if result.returncode == 0:
        print_success("PDF successfully linearized with qpdf.")
        return True

    # qpdf uses exit status 3 for "warnings, but the output was written".
    # Slightly malformed PDFs commonly trigger it, so treating it as a hard
    # failure would reject files that were in fact processed.
    if result.returncode == 3 and os.path.exists(output_pdf):
        print_warning("qpdf reported warnings; the linearized PDF was still written.")
        if details:
            print(details)
        return True

    print_error("Linearization failed.")
    if details:
        print(details)
    return False


def _reported_tag_names(exiftool_output):
    """Return the set of tag labels in exiftool's ``Label : value`` listing."""
    labels = set()
    for line in exiftool_output.splitlines():
        label, separator, _value = line.partition(":")
        if separator:
            labels.add(label.strip())
    return labels


def verify_metadata_removal(pdf_path):
    """Check whether exiftool still reports identifying metadata for a PDF.

    Only the tag labels of exiftool's listing are compared. Tag values, such
    as a file name or directory that happens to contain "Title" or "Author",
    are ignored so they cannot cause false positives.

    Args:
        pdf_path: Path of the PDF to inspect.

    Returns:
        True if none of the critical tags is present. False if any is
        present, or if exiftool could not be run or exited non-zero.
    """
    print_section("STEP 3 - VERIFICATION")
    print_info("Checking file", pdf_path)

    # Labels as printed by exiftool. The PDF creation date is listed as
    # "Create Date"; "Creation Date" is kept for backward compatibility.
    # "Creator Tool" is listed explicitly because labels are matched exactly.
    critical_metadata = [
        "Author",
        "Creator",
        "Creator Tool",
        "Producer",
        "Create Date",
        "Creation Date",
        "Modify Date",
        "Title",
        "Subject",
        "Keywords",
        "Trapped",
    ]

    try:
        result = run_command(["exiftool", pdf_path])
    except Exception as e:
        # Boundary handler: report any launch failure instead of crashing.
        print_error(f"Verification error: {e}")
        return False

    if result.returncode != 0:
        print_error("Verification failed while running exiftool.")
        details = result.stderr.strip()
        if details:
            print(details)
        return False

    present = _reported_tag_names(result.stdout)
    found_metadata = [field for field in critical_metadata if field in present]

    if found_metadata:
        print_warning(f"Remaining metadata found: {', '.join(found_metadata)}")
        return False

    print_success("No critical metadata found.")
    return True


def clean_pdf(input_path, output_path=None):
    """Strip metadata from one PDF, linearize it, and verify the result.

    The pipeline is: exiftool metadata removal, then qpdf linearization,
    then a copy to *output_path*, then verification of the written file.

    Args:
        input_path: Path of the PDF to clean; it must be an existing file.
        output_path: Destination path. Defaults to ``<name>_cleaned<ext>``
            next to the input file.

    Returns:
        True if the cleaned file was written (even when verification only
        produced a warning). False if the input is missing or any step
        failed. Errors are printed, not raised.
    """
    print_section("PDF CLEANING STARTED")

    # isfile (not exists): a directory must not be handed to exiftool.
    if not os.path.isfile(input_path):
        print_error(f"File not found: {input_path}")
        return False

    if output_path is None:
        input_dir = os.path.dirname(input_path)
        input_name = os.path.basename(input_path)
        name, ext = os.path.splitext(input_name)
        output_path = os.path.join(input_dir, f"{name}_cleaned{ext}")

    print_info("Source file", input_path)
    print_info("Output file", output_path)

    try:
        # Intermediate files live in a private temporary directory. This
        # gives the external tools output paths that do not exist yet
        # without the create-then-delete race of reserving names with
        # mkstemp, and the directory is removed on every exit path.
        with tempfile.TemporaryDirectory(prefix="pdfclean_") as work_dir:
            stripped_path = os.path.join(work_dir, "stripped.pdf")
            linearized_path = os.path.join(work_dir, "linearized.pdf")

            if not remove_metadata_exiftool(input_path, stripped_path):
                return False

            if not linearize_pdf_qpdf(stripped_path, linearized_path):
                return False

            shutil.copy2(linearized_path, output_path)

        print_success(f"Cleaned PDF saved to: {output_path}")

        verified = verify_metadata_removal(output_path)

        print_section("FINAL RESULT")
        if verified:
            print_success("PDF successfully cleaned and verified.")
        else:
            print_warning("PDF cleaned, but verification found possible remaining metadata.")

        return True

    except Exception as e:
        # Boundary handler: one bad file must not abort a batch run.
        print_error(f"Error during cleaning process: {e}")
        return False


def batch_clean_pdf(directory, recursive=False):
    """Clean every ``*.pdf`` file in a directory and print a summary.

    For each input ``<stem>.pdf`` the result is written next to it as
    ``<stem>_cleaned.pdf``. Files whose stem already ends in ``_cleaned``
    are treated as earlier outputs and skipped.

    Args:
        directory: Directory to scan.
        recursive: If True, also scan subdirectories.

    Returns:
        None. Progress and the final counts are printed.
    """
    print_section("BATCH MODE")

    directory = Path(directory)

    if not directory.is_dir():
        print_error(f"Directory not found: {directory}")
        return

    pattern = "**/*.pdf" if recursive else "*.pdf"
    # Sorted for a reproducible processing order; directories that merely
    # match the pattern are ignored.
    pdf_files = sorted(path for path in directory.glob(pattern) if path.is_file())

    if not pdf_files:
        print_warning("No PDF files found.")
        return

    print_info("Directory", directory)
    print_info("Recursive", recursive)
    print_info("PDF files found", len(pdf_files))

    success_count = 0
    processed_count = 0

    for pdf_file in pdf_files:
        # Suffix check, not substring: "my_cleaned_report.pdf" is a regular
        # input, only "<stem>_cleaned.pdf" is one of our own outputs.
        if pdf_file.stem.endswith("_cleaned"):
            print_warning(f"Skipping already cleaned file: {pdf_file.name}")
            continue

        processed_count += 1
        output_file = pdf_file.parent / f"{pdf_file.stem}_cleaned.pdf"

        print(f"\n{CYAN}{BOLD}--- Processing: {pdf_file.name} ---{RESET}")

        if clean_pdf(str(pdf_file), str(output_file)):
            success_count += 1

    print_section("BATCH SUMMARY")
    print_info("Eligible PDFs", processed_count)
    print_info("Successfully cleaned", success_count)
    print_info("Failed", processed_count - success_count)


def main():
    """Parse command-line arguments and run the requested mode.

    Modes:
        * ``INPUT -c``: only report remaining metadata for INPUT.
        * ``-d DIR [-r]``: clean every PDF in DIR.
        * ``INPUT [-o OUT]``: clean a single file.
        * no arguments: print the help text.

    Exit status: 2 for invalid argument combinations (argparse), 1 if a
    dependency is missing, if single-file cleaning fails, or if check mode
    finds metadata or cannot run; 0 otherwise.
    """
    parser = argparse.ArgumentParser(
        description="Remove metadata from PDF files using exiftool and qpdf",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s document.pdf                    Clean a single file
  %(prog)s document.pdf -o clean.pdf       Specify output filename
  %(prog)s -d ./pdfs                       Clean all PDFs in directory
  %(prog)s -d ./pdfs -r                    Clean PDFs recursively
  %(prog)s document.pdf -c                 Check metadata only
        """
    )

    parser.add_argument("input", nargs="?", help="Input PDF file")
    parser.add_argument("-o", "--output", help="Output PDF file")
    parser.add_argument("-d", "--directory", help="Directory containing PDF files")
    parser.add_argument("-r", "--recursive", action="store_true",
                        help="Search subdirectories recursively")
    parser.add_argument("-c", "--check", action="store_true",
                        help="Only check metadata, do not remove")

    args = parser.parse_args()

    # --check promises not to remove anything. Without an input file it
    # would otherwise be ignored, and "-c -d DIR" would run a batch clean.
    if args.check and not args.input:
        parser.error("--check requires an input PDF file")

    print_banner()

    if not check_dependencies():
        sys.exit(1)

    if args.check:
        print_section("CHECK MODE")
        print_info("Target file", args.input)
        is_clean = verify_metadata_removal(args.input)
        sys.exit(0 if is_clean else 1)

    if args.directory:
        batch_clean_pdf(args.directory, args.recursive)
    elif args.input:
        # Let shell callers detect a failed clean through the exit status.
        if not clean_pdf(args.input, args.output):
            sys.exit(1)
    else:
        parser.print_help()


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()