# Source code for thoth.ingestion.parsers

"""Document parsers for multi-format ingestion.

This module provides a unified interface for parsing different document
formats (Markdown, PDF, plain text, Word documents).

Example:
    >>> from thoth.ingestion.parsers import ParserFactory
    >>> from pathlib import Path
    >>>
    >>> doc = ParserFactory.parse(Path("document.pdf"))
    >>> print(doc.content)
"""

from pathlib import Path
from typing import ClassVar

from thoth.ingestion.parsers.base import DocumentParser, ParsedDocument
from thoth.ingestion.parsers.docx import DocxParser
from thoth.ingestion.parsers.markdown import MarkdownParser
from thoth.ingestion.parsers.pdf import PDFParser
from thoth.ingestion.parsers.text import TextParser
from thoth.shared.utils.logger import setup_logger

# Module-level logger, obtained from the project's setup_logger helper and
# keyed by this module's import name.
logger = setup_logger(__name__)

# Public API of the package: the parser base types and concrete parsers
# re-exported from their submodules, plus the ParserFactory defined below.
# This list controls what `from thoth.ingestion.parsers import *` exposes.
__all__ = [
    "DocumentParser",
    "DocxParser",
    "MarkdownParser",
    "PDFParser",
    "ParsedDocument",
    "ParserFactory",
    "TextParser",
]


class ParserFactory:
    """Factory for creating and using document parsers.

    The factory keeps an ordered registry of parser classes and selects the
    first one whose ``can_parse`` accepts a given file. Resolved parsers are
    cached per lower-cased file extension so repeated lookups for the same
    extension reuse one parser instance.

    All methods are classmethods and the registry/cache are class-level
    state, so the factory is used without instantiation.

    Example:
        >>> # Parse a single file
        >>> doc = ParserFactory.parse(Path("notes.md"))
        >>>
        >>> # Get parser for a specific file
        >>> parser = ParserFactory.get_parser(Path("document.pdf"))
        >>> if parser:
        ...     doc = parser.parse(Path("document.pdf"))
        >>>
        >>> # Check supported extensions
        >>> extensions = ParserFactory.supported_extensions()
        >>> print(extensions)  # ['.md', '.markdown', '.mdown', '.pdf', '.txt', ...]
    """

    # Registry of parser classes. Order matters: get_parser returns the first
    # class in this list whose instance accepts the file.
    _parser_classes: ClassVar[list[type[DocumentParser]]] = [
        MarkdownParser,
        PDFParser,
        TextParser,
        DocxParser,
    ]

    # Cache of parser instances, keyed by lower-cased extension (with dot).
    _parser_instances: ClassVar[dict[str, DocumentParser]] = {}

    @classmethod
    def get_parser(cls, file_path: Path) -> DocumentParser | None:
        """Get the appropriate parser for a file.

        Args:
            file_path: Path to the file to parse. Only the path itself is
                handed to each parser's ``can_parse``; this method does not
                open the file.

        Returns:
            DocumentParser instance if a suitable parser exists, None otherwise.
        """
        extension = file_path.suffix.lower()

        # Check cache first.
        cached = cls._parser_instances.get(extension)
        if cached is not None:
            return cached

        # Walk the registry in order; the first parser that accepts wins.
        for parser_class in cls._parser_classes:
            parser = parser_class()
            if not parser.can_parse(file_path):
                continue

            # Cache only the extension that actually matched. Caching every
            # extension the parser lists (and overwriting existing entries)
            # would let this parser claim an extension that an earlier parser
            # in the registry also handles, making later lookups depend on
            # which files happened to be resolved first.
            listed = {ext.lower() for ext in parser.supported_extensions}
            if extension in listed:
                cls._parser_instances[extension] = parser
            return parser

        return None

    @classmethod
    def parse(cls, file_path: Path) -> ParsedDocument:
        """Parse a file using the appropriate parser.

        Args:
            file_path: Path to the file to parse.

        Returns:
            ParsedDocument with extracted content and metadata.

        Raises:
            ValueError: If no parser is available for the file type.
            FileNotFoundError: Not raised by the factory itself; presumably
                propagated from the selected parser when the file does not
                exist (verify against the parser implementations).
        """
        parser = cls.get_parser(file_path)
        if parser is None:
            supported = cls.supported_extensions()
            msg = f"No parser available for '{file_path.suffix}'. Supported: {supported}"
            raise ValueError(msg)

        logger.debug("Parsing %s with %s", file_path, parser.name)
        return parser.parse(file_path)

    @classmethod
    def parse_content(cls, content: bytes, source_path: str, extension: str) -> ParsedDocument:
        """Parse content bytes using a parser for the given extension.

        Args:
            content: Raw file content as bytes.
            source_path: Original source path, passed through to the parser
                for metadata.
            extension: File extension, with or without the leading dot
                (e.g. ``'.pdf'`` or ``'pdf'``). Matching is case-insensitive.

        Returns:
            ParsedDocument with extracted content and metadata.

        Raises:
            ValueError: If no parser is available for the extension.
        """
        # Path.suffix only recognises an extension that starts with a dot, so
        # "pdf" would otherwise yield Path("filepdf") with an empty suffix.
        suffix = extension
        if suffix and not suffix.startswith("."):
            suffix = f".{suffix}"

        # Build a placeholder path purely to reuse the extension-based lookup.
        fake_path = Path(f"file{suffix}")
        parser = cls.get_parser(fake_path)

        if parser is None:
            supported = cls.supported_extensions()
            msg = f"No parser available for '{extension}'. Supported: {supported}"
            raise ValueError(msg)

        logger.debug("Parsing content for %s with %s", source_path, parser.name)
        return parser.parse_content(content, source_path)

    @classmethod
    def supported_extensions(cls) -> list[str]:
        """Get all supported file extensions.

        Returns:
            Sorted, de-duplicated list of the extensions reported by every
            registered parser, including the dot (e.g., ['.md', '.pdf']).
        """
        extensions: set[str] = set()
        for parser_class in cls._parser_classes:
            # supported_extensions is read from an instance, as in get_parser.
            extensions.update(parser_class().supported_extensions)
        return sorted(extensions)

    @classmethod
    def can_parse(cls, file_path: Path) -> bool:
        """Check if any parser can handle the given file.

        Args:
            file_path: Path to check.

        Returns:
            True if a parser is available for the file.
        """
        return cls.get_parser(file_path) is not None

    @classmethod
    def register_parser(cls, parser_class: type[DocumentParser]) -> None:
        """Register a new parser class.

        The class is appended to the end of the registry, so parsers that
        were registered earlier are still consulted first. Registering a
        class that is already present is a no-op.

        Args:
            parser_class: Parser class to register.
        """
        if parser_class in cls._parser_classes:
            return

        cls._parser_classes.append(parser_class)
        # Clear cache to include new parser.
        cls._parser_instances.clear()
        logger.info("Registered parser: %s", parser_class.__name__)