Source code for thoth.ingestion.parsers.docx

"""Word document parser.

This module provides parsing for Word documents using python-docx.
"""

from pathlib import Path
from typing import Any

from thoth.ingestion.parsers.base import DocumentParser, ParsedDocument
from thoth.shared.utils.logger import setup_logger

logger = setup_logger(__name__)


class DocxParser(DocumentParser):
    """Parser for Word documents using python-docx.

    Supports:
        - Word documents (.docx)
        - Paragraph text extraction
        - Table text extraction (one line per row, cells joined with " | ")
        - Basic metadata extraction (title, author, subject, keywords)

    Note:
        Only supports .docx format (Office Open XML). Legacy .doc files
        are not supported.
    """

    @property
    def supported_extensions(self) -> list[str]:
        """Return supported Word document extensions."""
        return [".docx"]

    @staticmethod
    def _import_document_class() -> Any:
        """Lazily import and return python-docx's ``Document`` factory.

        The import is deferred to call time so that this module can be
        imported even when the optional python-docx dependency is missing.

        Returns:
            The ``docx.Document`` callable.

        Raises:
            ImportError: If python-docx is not installed.
        """
        try:
            from docx import Document  # noqa: PLC0415
        except ImportError as e:
            msg = "python-docx is required for Word document parsing. Install with: pip install python-docx"
            raise ImportError(msg) from e
        return Document

    def parse(self, file_path: Path) -> ParsedDocument:
        """Parse a Word document.

        Args:
            file_path: Path to the Word document

        Returns:
            ParsedDocument with extracted text and metadata

        Raises:
            FileNotFoundError: If file doesn't exist
            ImportError: If python-docx is not installed
        """
        # Check existence before importing so a missing file is reported as
        # FileNotFoundError even when python-docx is not installed.
        if not file_path.exists():
            msg = f"File not found: {file_path}"
            raise FileNotFoundError(msg)

        document_cls = self._import_document_class()
        doc = document_cls(str(file_path))
        return self._extract_document(doc, str(file_path))

    def parse_content(self, content: bytes, source_path: str) -> ParsedDocument:
        """Parse Word document content from bytes.

        Args:
            content: Raw document content as bytes
            source_path: Original source path for metadata

        Returns:
            ParsedDocument with extracted text and metadata

        Raises:
            ImportError: If python-docx is not installed
        """
        document_cls = self._import_document_class()

        import io  # noqa: PLC0415

        # Wrap the bytes in a file-like object so no temporary file is needed.
        doc = document_cls(io.BytesIO(content))
        return self._extract_document(doc, source_path)

    def _extract_document(self, doc: Any, source_path: str) -> ParsedDocument:
        """Extract text and metadata from a Document object.

        Body paragraphs are collected first, then one line per table row
        (non-empty cells joined with " | "). Tables are therefore emitted
        after all body paragraphs rather than at their original position in
        the document.

        Args:
            doc: python-docx Document object
            source_path: Original source path for metadata

        Returns:
            ParsedDocument with extracted content
        """
        # Body paragraphs, skipping empty / whitespace-only ones.
        paragraphs = [text for para in doc.paragraphs if (text := para.text.strip())]

        # Table text. python-docx yields a merged cell once for every grid
        # position it spans (horizontally and vertically), which would repeat
        # its text. Track the underlying cell element so each physical cell
        # contributes its text only once. The element is stored as the dict
        # value to keep it alive, which keeps its id() stable and unique.
        seen_cells: dict[int, Any] = {}
        for table in doc.tables:
            for row in table.rows:
                row_text = []
                for cell in row.cells:
                    # Fall back to the cell object itself for cell-like
                    # objects that do not expose the underlying element.
                    cell_key = getattr(cell, "_tc", cell)
                    if id(cell_key) in seen_cells:
                        continue
                    seen_cells[id(cell_key)] = cell_key

                    cell_text = cell.text.strip()
                    if cell_text:
                        row_text.append(cell_text)
                if row_text:
                    paragraphs.append(" | ".join(row_text))

        # Metadata from the document's core properties; unset values become "".
        core_props = doc.core_properties
        metadata = {
            "source_path": source_path,
            "title": core_props.title or "",
            "author": core_props.author or "",
            "subject": core_props.subject or "",
            "keywords": core_props.keywords or "",
            # Counts every extracted text block, including table rows.
            "paragraph_count": len(paragraphs),
        }

        # Remove empty metadata values (this also drops a zero paragraph_count).
        metadata = {k: v for k, v in metadata.items() if v}

        return ParsedDocument(
            content="\n\n".join(paragraphs),
            metadata=metadata,
            source_path=source_path,
            format="docx",
        )