Source code for thoth.ingestion.parsers.pdf

"""PDF document parser.

This module provides parsing for PDF files using PyMuPDF (fitz).
"""

from pathlib import Path

from thoth.ingestion.parsers.base import DocumentParser, ParsedDocument
from thoth.shared.utils.logger import setup_logger

logger = setup_logger(__name__)


[docs] class PDFParser(DocumentParser): """Parser for PDF files using PyMuPDF. Supports: - PDF files (.pdf) - Text extraction with page numbers - Basic metadata extraction (title, author, page count) """ @property def supported_extensions(self) -> list[str]: """Return supported PDF extensions.""" return [".pdf"]
[docs] def parse(self, file_path: Path) -> ParsedDocument: """Parse a PDF file. Args: file_path: Path to the PDF file Returns: ParsedDocument with extracted text and metadata Raises: FileNotFoundError: If file doesn't exist ImportError: If PyMuPDF is not installed """ if not file_path.exists(): msg = f"File not found: {file_path}" raise FileNotFoundError(msg) content = file_path.read_bytes() return self.parse_content(content, str(file_path))
[docs] def parse_content(self, content: bytes, source_path: str) -> ParsedDocument: """Parse PDF content from bytes. Args: content: Raw PDF content as bytes source_path: Original source path for metadata Returns: ParsedDocument with extracted text and metadata """ try: import fitz # PyMuPDF # noqa: PLC0415 except ImportError as e: msg = "PyMuPDF is required for PDF parsing. Install with: pip install PyMuPDF" raise ImportError(msg) from e # Open PDF from bytes doc = fitz.open(stream=content, filetype="pdf") try: text_parts = [] for page_num in range(len(doc)): page = doc[page_num] text = page.get_text() if text.strip(): # Add page marker for context text_parts.append(f"[Page {page_num + 1}]\n{text}") # Extract metadata pdf_metadata = doc.metadata or {} metadata = { "source_path": source_path, "page_count": len(doc), "title": pdf_metadata.get("title", ""), "author": pdf_metadata.get("author", ""), "subject": pdf_metadata.get("subject", ""), "creator": pdf_metadata.get("creator", ""), "producer": pdf_metadata.get("producer", ""), } # Remove empty metadata values metadata = {k: v for k, v in metadata.items() if v} combined_text = "\n\n".join(text_parts) return ParsedDocument( content=combined_text, metadata=metadata, source_path=source_path, format="pdf", ) finally: doc.close()