Today I needed to convert a PDF to Markdown on macOS. I tried CLIs I could install with brew, but none of them gave me decent output for structure and formatting. marker-pdf was the first one that consistently gave me clean Markdown, even though it required more work to set up.
I used this convert.py script:
#!/usr/bin/env python3
import subprocess
from pathlib import Path
def convert_pdf_to_markdown(pdf_file: str) -> None:
    """Convert a PDF to Markdown with marker-pdf's ``marker_single`` CLI.

    A virtual environment (``.venv``) is created next to the PDF if it does
    not exist yet, and ``marker-pdf`` is installed into it when the
    ``marker_single`` executable is missing. The conversion output is
    written to an ``output`` directory next to the PDF.

    Args:
        pdf_file: Path to the PDF. ``~`` is expanded and the path is
            resolved to an absolute path before use.

    Raises:
        FileNotFoundError: If ``pdf_file`` does not point to an existing
            regular file.
        subprocess.CalledProcessError: If creating the venv, installing
            marker-pdf, or running the conversion exits with a non-zero
            status.
    """
    import shlex  # local import: only needed to echo commands below

    pdf_path = Path(pdf_file).expanduser().resolve()
    # is_file() rather than exists(): a directory must be rejected here,
    # before any venv is created or packages are installed.
    if not pdf_path.is_file():
        raise FileNotFoundError(f"PDF not found: {pdf_path}")

    root_dir = pdf_path.parent
    venv_dir = root_dir / ".venv"
    output_dir = root_dir / "output"
    python_bin = venv_dir / "bin" / "python"
    marker_bin = venv_dir / "bin" / "marker_single"

    def run(cmd: list[str]) -> None:
        """Echo *cmd* in shell-quoted form, then run it, raising on failure."""
        # shlex.join keeps the echoed command unambiguous (and copy-pasteable)
        # when a path contains spaces.
        print("+", shlex.join(cmd))
        subprocess.run(cmd, check=True)

    if not python_bin.exists():
        run(["python3", "-m", "venv", str(venv_dir)])

    if not marker_bin.exists():
        # Invoke pip through the venv's interpreter so the package is
        # installed into this venv regardless of what is on PATH.
        run([str(python_bin), "-m", "pip", "install", "marker-pdf"])

    output_dir.mkdir(parents=True, exist_ok=True)

    run(
        [
            str(marker_bin),
            str(pdf_path),
            "--output_format",
            "markdown",
            "--output_dir",
            str(output_dir),
        ]
    )
if __name__ == "__main__":
    import sys  # local import keeps this entry point self-contained

    # Use the PDF path given as the first command-line argument when there
    # is one; otherwise fall back to the placeholder path below, which has
    # to be edited to point at a real PDF.
    default_pdf = "path/to/the/pdf/you/want/to/convert.pdf"
    convert_pdf_to_markdown(sys.argv[1] if len(sys.argv) > 1 else default_pdf)
Then run:
cd path/to/the/folder/with/convert.py
chmod +x convert.py
./convert.py
Output goes to:
path/to/the/pdf/folder/output/<pdf-name>/<pdf-name>.md
The first run will be slower because it creates .venv (in the same folder as the PDF), installs marker-pdf, and downloads model files.