Source code for thoth.ingestion.parsers

"""Document parsers for multi-format ingestion.

This module provides a unified interface for parsing different document
formats (Markdown, PDF, plain text, Word documents).

Example:
    >>> from thoth.ingestion.parsers import ParserFactory
    >>> from pathlib import Path
    >>>
    >>> doc = ParserFactory.parse(Path("document.pdf"))
    >>> print(doc.content)
"""

from pathlib import Path
from typing import ClassVar

from thoth.ingestion.parsers.base import DocumentParser, ParsedDocument
from thoth.ingestion.parsers.docx import DocxParser
from thoth.ingestion.parsers.markdown import MarkdownParser
from thoth.ingestion.parsers.pdf import PDFParser
from thoth.ingestion.parsers.text import TextParser
from thoth.shared.utils.logger import setup_logger

logger = setup_logger(__name__)

__all__ = [
    "DocumentParser",
    "DocxParser",
    "MarkdownParser",
    "PDFParser",
    "ParsedDocument",
    "ParserFactory",
    "TextParser",
]



[docs]
class ParserFactory:
    """Factory that picks a document parser based on a file's extension.

    The factory holds a registry of parser classes and a cache of parser
    instances keyed by lower-cased file extension. Every method is a class
    method, so the registry and the cache are state shared by all callers.

    Example:
        >>> # Parse a single file
        >>> doc = ParserFactory.parse(Path("notes.md"))
        >>>
        >>> # Get parser for a specific file
        >>> parser = ParserFactory.get_parser(Path("document.pdf"))
        >>> if parser:
        ...     doc = parser.parse(Path("document.pdf"))
        >>>
        >>> # Check supported extensions
        >>> extensions = ParserFactory.supported_extensions()
        >>> print(extensions)  # sorted, de-duplicated list of extensions
    """

    # Registry of parser classes; get_parser() probes them in this order.
    _parser_classes: ClassVar[list[type[DocumentParser]]] = [
        MarkdownParser,
        PDFParser,
        TextParser,
        DocxParser,
    ]

    # Cache of parser instances keyed by lower-cased extension (with the dot).
    _parser_instances: ClassVar[dict[str, DocumentParser]] = {}

    @classmethod
    def get_parser(cls, file_path: Path) -> DocumentParser | None:
        """Get appropriate parser for a file.

        The lookup key is ``file_path.suffix`` lower-cased. On a cache miss
        the registered parser classes are instantiated one by one, in
        registry order, and the first whose ``can_parse(file_path)`` is true
        wins.

        Args:
            file_path: Path to the file to parse

        Returns:
            DocumentParser instance if a suitable parser exists, None otherwise
        """
        extension = file_path.suffix.lower()

        # Fast path: this extension was resolved by an earlier call.
        if extension in cls._parser_instances:
            return cls._parser_instances[extension]

        for parser_class in cls._parser_classes:
            candidate = parser_class()
            if not candidate.can_parse(file_path):
                continue

            # Pre-populate the cache for every extension this parser
            # declares. setdefault() is deliberate: an extension that was
            # already resolved to some parser must keep that parser, rather
            # than being silently re-pointed by a lookup for a different
            # extension when two parsers declare overlapping extensions.
            for ext in candidate.supported_extensions:
                cls._parser_instances.setdefault(ext.lower(), candidate)
            return candidate

        return None

    @classmethod
    def parse(cls, file_path: Path) -> ParsedDocument:
        """Parse a file using the appropriate parser.

        Args:
            file_path: Path to the file to parse

        Returns:
            ParsedDocument with extracted content and metadata

        Raises:
            ValueError: If no parser is available for the file type

        Note:
            Exceptions raised by the selected parser's ``parse`` propagate
            unchanged (presumably FileNotFoundError for a missing file --
            that depends on the individual parser, verify there).
        """
        parser = cls.get_parser(file_path)

        if parser is None:
            supported = cls.supported_extensions()
            msg = f"No parser available for '{file_path.suffix}'. Supported: {supported}"
            raise ValueError(msg)

        logger.debug("Parsing %s with %s", file_path, parser.name)
        return parser.parse(file_path)

    @classmethod
    def parse_content(cls, content: bytes, source_path: str, extension: str) -> ParsedDocument:
        """Parse content bytes using a parser for the given extension.

        Args:
            content: Raw file content as bytes
            source_path: Original source path for metadata
            extension: File extension, with or without the leading dot
                (e.g., '.pdf' or 'pdf')

        Returns:
            ParsedDocument with extracted content and metadata

        Raises:
            ValueError: If no parser is available for the extension
        """
        # Without a leading dot, Path("filepdf").suffix is "" and the lookup
        # could never match, so add the dot when the caller omitted it.
        dotted = extension
        if extension and not extension.startswith("."):
            dotted = f".{extension}"

        # get_parser() works on paths, so build a placeholder path that
        # carries only the extension.
        parser = cls.get_parser(Path(f"file{dotted}"))

        if parser is None:
            supported = cls.supported_extensions()
            msg = f"No parser available for '{extension}'. Supported: {supported}"
            raise ValueError(msg)

        logger.debug("Parsing content for %s with %s", source_path, parser.name)
        return parser.parse_content(content, source_path)

    @classmethod
    def supported_extensions(cls) -> list[str]:
        """Get all supported file extensions.

        Each registered parser class is instantiated and its
        ``supported_extensions`` are collected.

        Returns:
            Sorted, de-duplicated list of supported extensions as declared by
            the parsers (e.g., ['.md', '.pdf'])
        """
        collected: set[str] = set()
        for parser_class in cls._parser_classes:
            collected.update(parser_class().supported_extensions)
        return sorted(collected)

    @classmethod
    def can_parse(cls, file_path: Path) -> bool:
        """Check if any parser can handle the given file.

        Args:
            file_path: Path to check

        Returns:
            True if a parser is available for the file
        """
        return cls.get_parser(file_path) is not None

    @classmethod
    def register_parser(cls, parser_class: type[DocumentParser]) -> None:
        """Register a new parser class.

        The class is appended to the end of the registry, so parsers
        registered earlier are probed first by ``get_parser``. Registering a
        class that is already present does nothing.

        Args:
            parser_class: Parser class to register
        """
        if parser_class in cls._parser_classes:
            return

        cls._parser_classes.append(parser_class)
        # Drop cached instances so later lookups are resolved against the
        # updated registry.
        cls._parser_instances.clear()
        logger.info("Registered parser: %s", parser_class.__name__)