"""
PDF Metadata Cleaner
Made by Ogisha
Uses exiftool and qpdf to remove metadata from PDF files and linearize them.
Designed for educational purposes and authorized document privacy analysis.
"""
import os
import sys
import shutil
import argparse
import tempfile
import subprocess
from pathlib import Path
# ANSI escape sequences used by the print_* helpers to style terminal output.
RESET = "\033[0m"  # emitted after styled text to switch styling back off
BOLD = "\033[1m"
DIM = "\033[2m"
# Foreground colors.
RED = "\033[91m"
GREEN = "\033[92m"
YELLOW = "\033[93m"
BLUE = "\033[94m"
MAGENTA = "\033[95m"
CYAN = "\033[96m"
WHITE = "\033[97m"
def print_banner():
    """Print the boxed program banner in bold cyan, then reset styling."""
    rows = [
        "",
        f"{CYAN}{BOLD}",
        "╔══════════════════════════════════════════════════════════════╗",
        "║ PDF METADATA CLEANER ║",
        "║ Made by Ogisha ║",
        "╚══════════════════════════════════════════════════════════════╝",
        f"{RESET}",
        "",
    ]
    # Leading/trailing empty rows reproduce the blank lines around the box.
    print("\n".join(rows))
def print_info(label, value):
    """Print an ``[INFO]`` line showing *label* followed by *value*.

    Args:
        label: Short description, rendered in white before a colon.
        value: Any object; it is formatted with ``str()`` semantics.
    """
    line = f"{BOLD}{BLUE}[INFO]{RESET} {WHITE}{label}:{RESET} {value}"
    print(line)
def print_success(message):
    """Print *message* prefixed with a bold green ``[OK]`` tag."""
    line = f"{BOLD}{GREEN}[OK]{RESET} {message}"
    print(line)
def print_warning(message):
    """Print *message* prefixed with a bold yellow ``[WARN]`` tag."""
    line = f"{BOLD}{YELLOW}[WARN]{RESET} {message}"
    print(line)
def print_error(message):
    """Print *message* prefixed with a bold red ``[ERROR]`` tag."""
    line = f"{BOLD}{RED}[ERROR]{RESET} {message}"
    print(line)
def print_section(title):
    """Print *title* centered between two 62-character ``=`` rules.

    A blank line precedes the block. Bold magenta is switched on before the
    first rule and reset after the last one, so the title is styled too.
    """
    width = 62
    rule = "=" * width
    print(f"\n{MAGENTA}{BOLD}{rule}")
    print(f"{title.center(width)}")
    print(f"{rule}{RESET}")
def run_command(cmd):
    """Run *cmd* and return the resulting ``subprocess.CompletedProcess``.

    *cmd* is handed to ``subprocess.run`` unchanged. Both stdout and stderr
    are captured and decoded to text. The exit status is not checked here;
    callers inspect ``returncode`` themselves.
    """
    completed = subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    return completed
def check_dependencies():
    """Check that the external tools exiftool and qpdf can be executed.

    Each tool is run with its own version flag. A tool that cannot be found
    is reported together with install hints for macOS and Linux.

    Returns:
        bool: True when every tool ran and exited with status 0, else False.
    """
    print_section("DEPENDENCY CHECK")
    dependencies = {
        "exiftool": {
            # exiftool's version option is "-ver". "--version" would be read
            # as "exclude the tag named version", not as a version query.
            "version_args": ["-ver"],
            "macos": "brew install exiftool",
            "linux": "sudo apt-get install -y libimage-exiftool-perl",
        },
        "qpdf": {
            "version_args": ["--version"],
            "macos": "brew install qpdf",
            "linux": "sudo apt-get install -y qpdf",
        },
    }
    all_ok = True
    for dep, info in dependencies.items():
        try:
            result = run_command([dep, *info["version_args"]])
        except FileNotFoundError:
            # The executable is not on PATH.
            all_ok = False
            print_error(f"{dep} is not installed.")
            print(f" {BOLD}Install on macOS:{RESET} {info['macos']}")
            print(f" {BOLD}Install on Linux:{RESET} {info['linux']}")
            continue

        if result.returncode != 0:
            all_ok = False
            print_error(f"{dep} is installed but returned an error.")
            continue

        # Some tools print their version on stderr; use whichever is set.
        version_output = (result.stdout or result.stderr).strip().splitlines()
        version = version_output[0] if version_output else "installed"
        print_success(f"{dep} found ({version})")
    return all_ok
def remove_metadata_exiftool(input_pdf, output_pdf):
    """Write a copy of *input_pdf* with all metadata stripped by exiftool.

    Args:
        input_pdf: Path of the PDF to read; it is not modified.
        output_pdf: Path exiftool writes the stripped copy to (``-o``).

    Returns:
        bool: True if exiftool exited with status 0, otherwise False.
    """
    print_section("STEP 1 - METADATA REMOVAL")
    print_info("Input PDF", input_pdf)
    print_info("Temporary output", output_pdf)
    try:
        # Only "-all=" is used. Following it with "-TagsFromFile @ -Author
        # -Title ..." would copy those fields back from the source file,
        # preserving exactly the metadata this step is meant to remove.
        #
        # NOTE(review): per the ExifTool documentation, PDF edits are
        # reversible until the file is rewritten by another tool; the qpdf
        # step in linearize_pdf_qpdf is presumably what makes the removal
        # permanent - confirm.
        cmd = [
            "exiftool",
            "-all=",
            "-o", output_pdf,
            input_pdf,
        ]
        result = run_command(cmd)
        if result.returncode == 0:
            print_success("Metadata successfully removed with exiftool.")
            return True
        print_error("Metadata removal failed.")
        if result.stderr.strip():
            print(result.stderr.strip())
        return False
    except Exception as e:
        # Best-effort boundary: report the problem and signal failure rather
        # than aborting a batch run.
        print_error(f"Unexpected exiftool error: {e}")
        return False
def linearize_pdf_qpdf(input_pdf, output_pdf):
    """Rewrite *input_pdf* as a linearized PDF at *output_pdf* using qpdf.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path qpdf writes the rewritten PDF to.

    Returns:
        bool: True if qpdf produced the output file (exit status 0, or
        exit status 3 meaning "completed with warnings"), otherwise False.
    """
    print_section("STEP 2 - PDF LINEARIZATION")
    print_info("Input PDF", input_pdf)
    print_info("Temporary output", output_pdf)
    # qpdf's documented exit status for "warnings, but output was written".
    qpdf_exit_warning = 3
    try:
        cmd = [
            "qpdf",
            "--linearize",
            "--object-streams=disable",
            "--remove-unreferenced-resources=yes",
            "--compress-streams=y",
            "--coalesce-contents",
            input_pdf,
            output_pdf,
        ]
        result = run_command(cmd)
        if result.returncode == 0:
            print_success("PDF successfully linearized with qpdf.")
            return True
        # Warnings are common on real-world PDFs and are not fatal. Accept
        # the result as long as the output file really exists.
        if result.returncode == qpdf_exit_warning and os.path.isfile(output_pdf):
            print_warning("qpdf reported warnings, but the linearized PDF was written.")
            if result.stderr.strip():
                print(result.stderr.strip())
            return True
        print_error("Linearization failed.")
        if result.stderr.strip():
            print(result.stderr.strip())
        return False
    except Exception as e:
        # Best-effort boundary: report the problem and signal failure rather
        # than aborting a batch run.
        print_error(f"Unexpected qpdf error: {e}")
        return False
def verify_metadata_removal(pdf_path):
    """Report whether exiftool still lists identifying metadata in a PDF.

    exiftool's default output has one ``Tag Name : value`` pair per line.
    Only the tag-name column is compared against the list of critical
    fields, so values (for example a file name containing "Title") and
    longer tag names (for example "File Creation Date/Time") cannot trigger
    a false positive.

    Args:
        pdf_path: Path of the PDF to inspect.

    Returns:
        bool: True if none of the critical tags is present; False if any is
        present or if exiftool could not be run successfully.
    """
    print_section("STEP 3 - VERIFICATION")
    print_info("Checking file", pdf_path)
    try:
        result = run_command(["exiftool", pdf_path])
        if result.returncode != 0:
            print_error("Verification failed while running exiftool.")
            if result.stderr.strip():
                print(result.stderr.strip())
            return False

        critical_metadata = [
            "Author",
            "Creator",
            # Kept explicitly: the earlier substring check flagged this tag
            # through "Creator".
            "Creator Tool",
            "Producer",
            # exiftool labels the PDF CreationDate field "Create Date";
            # "Creation Date" is kept in case another label is ever emitted.
            "Create Date",
            "Creation Date",
            "Modify Date",
            "Title",
            "Subject",
            "Keywords",
            "Trapped",
        ]

        # Collect the tag names, i.e. the text before the first colon.
        present_tags = set()
        for line in result.stdout.splitlines():
            tag_name, separator, _value = line.partition(":")
            if separator:
                present_tags.add(tag_name.strip())

        found_metadata = [
            field for field in critical_metadata if field in present_tags
        ]
        if found_metadata:
            print_warning(f"Remaining metadata found: {', '.join(found_metadata)}")
            return False
        print_success("No critical metadata found.")
        return True
    except Exception as e:
        # Best-effort boundary: report the problem and signal failure.
        print_error(f"Verification error: {e}")
        return False
def clean_pdf(input_path, output_path=None):
    """Strip metadata from one PDF and save a linearized, cleaned copy.

    The file is passed through exiftool, then qpdf, and the result is copied
    to *output_path* and checked with ``verify_metadata_removal``.

    Args:
        input_path: Path of the PDF to clean.
        output_path: Destination path. When None, ``_cleaned`` is appended
            to the input file's base name in the same directory.

    Returns:
        bool: True if the cleaned file was written. This holds even when
        verification reports remaining metadata, which is only printed as a
        warning. False if the input is missing or any step failed.
    """
    print_section("PDF CLEANING STARTED")
    # isfile() also rejects directories, which the external tools cannot
    # process anyway.
    if not os.path.isfile(input_path):
        print_error(f"File not found: {input_path}")
        return False

    if output_path is None:
        input_dir = os.path.dirname(input_path)
        input_name = os.path.basename(input_path)
        name, ext = os.path.splitext(input_name)
        output_path = os.path.join(input_dir, f"{name}_cleaned{ext}")

    print_info("Source file", input_path)
    print_info("Output file", output_path)

    try:
        # exiftool's -o needs output paths that do not exist yet. Reserving
        # names with mkstemp() and deleting them leaves a window in which
        # another process could claim the name, so the intermediates live in
        # a private temporary directory that is removed on exit.
        with tempfile.TemporaryDirectory(prefix="pdfclean_") as work_dir:
            stripped_pdf = os.path.join(work_dir, "stripped.pdf")
            linearized_pdf = os.path.join(work_dir, "linearized.pdf")

            if not remove_metadata_exiftool(input_path, stripped_pdf):
                return False
            if not linearize_pdf_qpdf(stripped_pdf, linearized_pdf):
                return False
            shutil.copy2(linearized_pdf, output_path)

        print_success(f"Cleaned PDF saved to: {output_path}")
        verified = verify_metadata_removal(output_path)
        print_section("FINAL RESULT")
        if verified:
            print_success("PDF successfully cleaned and verified.")
        else:
            print_warning("PDF cleaned, but verification found possible remaining metadata.")
        return True
    except Exception as e:
        # Best-effort boundary so one bad file does not abort a batch run.
        print_error(f"Error during cleaning process: {e}")
        return False
def batch_clean_pdf(directory, recursive=False):
    """Clean every PDF in *directory* and print a summary.

    Each ``name.pdf`` is written to ``name_cleaned.pdf`` next to the
    original. Files whose base name already ends in ``_cleaned`` are treated
    as earlier outputs and skipped.

    Args:
        directory: Directory to scan.
        recursive: When True, subdirectories are scanned as well.

    Returns:
        None. Results are reported on stdout only.
    """
    print_section("BATCH MODE")
    directory = Path(directory)
    # is_dir() also rejects a path that exists but is a regular file.
    if not directory.is_dir():
        print_error(f"Directory not found: {directory}")
        return

    candidates = directory.rglob("*") if recursive else directory.iterdir()
    # Match the extension case-insensitively so "REPORT.PDF" is found on
    # case-sensitive filesystems, and ignore directories named "*.pdf".
    # Sorting gives a stable processing order.
    pdf_files = sorted(
        path
        for path in candidates
        if path.is_file() and path.suffix.lower() == ".pdf"
    )
    if not pdf_files:
        print_warning("No PDF files found.")
        return

    print_info("Directory", directory)
    print_info("Recursive", recursive)
    print_info("PDF files found", len(pdf_files))

    success_count = 0
    processed_count = 0
    for pdf_file in pdf_files:
        # Only a trailing "_cleaned" marks an output of this tool. A
        # substring test would also skip names such as
        # "notes_cleaned_up.pdf".
        if pdf_file.stem.endswith("_cleaned"):
            print_warning(f"Skipping already cleaned file: {pdf_file.name}")
            continue
        processed_count += 1
        output_file = pdf_file.parent / f"{pdf_file.stem}_cleaned.pdf"
        print(f"\n{CYAN}{BOLD}--- Processing: {pdf_file.name} ---{RESET}")
        if clean_pdf(str(pdf_file), str(output_file)):
            success_count += 1

    print_section("BATCH SUMMARY")
    print_info("Eligible PDFs", processed_count)
    print_info("Successfully cleaned", success_count)
    print_info("Failed", processed_count - success_count)
def main():
    """Parse the command line and run check, batch or single-file mode.

    Exit status:
        0 on success, and always after check mode has run.
        1 if a required tool is missing or a single-file clean failed.
        2 (from argparse) on invalid arguments, including ``--check``
        without an input file.
    """
    parser = argparse.ArgumentParser(
        description="Remove metadata from PDF files using exiftool and qpdf",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
%(prog)s document.pdf Clean a single file
%(prog)s document.pdf -o clean.pdf Specify output filename
%(prog)s -d ./pdfs Clean all PDFs in directory
%(prog)s -d ./pdfs -r Clean PDFs recursively
%(prog)s document.pdf -c Check metadata only
"""
    )
    parser.add_argument("input", nargs="?", help="Input PDF file")
    parser.add_argument("-o", "--output", help="Output PDF file")
    parser.add_argument("-d", "--directory", help="Directory containing PDF files")
    parser.add_argument("-r", "--recursive", action="store_true",
                        help="Search subdirectories recursively")
    parser.add_argument("-c", "--check", action="store_true",
                        help="Only check metadata, do not remove")
    args = parser.parse_args()

    # "--check" promises not to modify anything. Without this guard,
    # "-c -d DIR" would fall through to batch mode and rewrite files.
    if args.check and not args.input:
        parser.error("-c/--check requires an input PDF file")

    print_banner()
    if not check_dependencies():
        sys.exit(1)

    if args.check:
        print_section("CHECK MODE")
        print_info("Target file", args.input)
        verify_metadata_removal(args.input)
        sys.exit(0)

    if args.directory:
        batch_clean_pdf(args.directory, args.recursive)
    elif args.input:
        # Surface a failed clean to the calling shell or script.
        if not clean_pdf(args.input, args.output):
            sys.exit(1)
    else:
        parser.print_help()
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()