"""Plain text document parser.
This module provides parsing for plain text files.
"""
from pathlib import Path
from thoth.ingestion.parsers.base import DocumentParser, ParsedDocument
from thoth.shared.utils.logger import setup_logger
# Module-level logger, obtained from the project's setup_logger helper using this module's name.
logger = setup_logger(__name__)
class TextParser(DocumentParser):
    """Parser for plain text files.

    Supports:
        - Plain text files (.txt, .text)
        - UTF-8 encoding (a leading byte-order mark is stripped) with
          fallback to latin-1
    """

    @property
    def supported_extensions(self) -> list[str]:
        """Return supported text extensions."""
        return [".txt", ".text"]

    def parse(self, file_path: Path) -> ParsedDocument:
        """Parse a plain text file.

        The file is read as raw bytes and handed to :meth:`parse_content`,
        which performs the decoding.

        Args:
            file_path: Path to the text file

        Returns:
            ParsedDocument with the decoded content and basic metadata

        Raises:
            FileNotFoundError: If file doesn't exist
        """
        if not file_path.exists():
            msg = f"File not found: {file_path}"
            raise FileNotFoundError(msg)

        # Read bytes rather than text so the encoding fallback lives in one place.
        content = file_path.read_bytes()
        return self.parse_content(content, str(file_path))

    def parse_content(self, content: bytes, source_path: str) -> ParsedDocument:
        """Parse text content from bytes.

        The bytes are decoded as UTF-8 first; if that fails they are decoded
        as latin-1 and a warning is logged.

        Args:
            content: Raw file content as bytes
            source_path: Original source path for metadata

        Returns:
            ParsedDocument whose ``content`` is the decoded text, with
            ``format`` set to ``"text"`` and metadata holding ``source_path``,
            ``char_count`` and ``line_count``
        """
        try:
            # "utf-8-sig" decodes ordinary UTF-8 unchanged but drops a leading
            # byte-order mark, so a BOM never leaks into the document text
            # or the character count.
            text = content.decode("utf-8-sig")
        except UnicodeDecodeError:
            # latin-1 maps every byte value to a code point, so this fallback
            # cannot itself raise a decode error.
            text = content.decode("latin-1")
            logger.warning("File %s not valid UTF-8, used latin-1 fallback", source_path)

        # Basic metadata. line_count is the number of "\n" characters plus
        # one, so empty text counts as one line and a trailing newline adds
        # a final (empty) line to the count.
        metadata = {
            "source_path": source_path,
            "char_count": len(text),
            "line_count": text.count("\n") + 1,
        }

        return ParsedDocument(
            content=text,
            metadata=metadata,
            source_path=source_path,
            format="text",
        )