import os
from typing import List, Dict
from llama_index.core import Document
from llama_index.core.readers.base import BaseReader
from .html_processor import HTMLProcessor


class TxtReader(BaseReader):
    """Reader that loads a single text/markdown file into one ``Document``.

    When ``clean_html`` is enabled, the file content is passed through
    ``HTMLProcessor.clean_html`` before the document is built.
    """

    # Extensions that are stripped from the file name to form the title.
    _KNOWN_EXTENSIONS = ('.txt', '.md')

    def __init__(self, clean_html: bool = True):
        """Create the reader.

        Args:
            clean_html: If true, an ``HTMLProcessor`` is created and applied
                to every file's content in ``load_data``.
        """
        self.clean_html = clean_html
        self.html_processor = HTMLProcessor() if clean_html else None

    @classmethod
    def _split_name(cls, file_path: str):
        """Return ``(title, file_type)`` derived from the path's base name.

        Only a real trailing ``.txt`` / ``.md`` extension (case-insensitive)
        is removed to form the title; other names are kept whole. The file
        type is the lower-cased extension without the dot, or ``'txt'`` when
        the name has no extension.
        """
        base_name = os.path.basename(file_path)
        stem, ext = os.path.splitext(base_name)
        ext = ext.lower()
        title = stem if ext in cls._KNOWN_EXTENSIONS else base_name
        file_type = ext[1:] or 'txt'
        return title, file_type

    def load_data(self, file_path: str, extra_info: Dict = None, encoding: str = 'utf-8') -> List[Document]:
        """Read ``file_path`` and wrap its content in a single ``Document``.

        Args:
            file_path: Path of the file to read.
            extra_info: Optional metadata to merge into the document's
                metadata. It is copied, never mutated; the keys set here
                (``file_path``, ``title``, ``file_type``, ``file_size``,
                ``html_cleaned``) override same-named keys in it.
            encoding: Text encoding used to open the file.

        Returns:
            A list with one ``Document``, or an empty list if anything
            failed (the error is printed, not raised).
        """
        documents = []

        # Best-effort by design: any failure is reported and yields an empty
        # list so that a batch caller can continue with the other files.
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                content = f.read()

            if self.clean_html and self.html_processor:
                original_length = len(content)
                content = self.html_processor.clean_html(content)
                cleaned_length = len(content)

                # Only report when cleaning removed more than 10% of the text.
                removed = original_length - cleaned_length
                if original_length > 0 and removed > original_length * 0.1:
                    print(f"HTML cleaned: {original_length} -> {cleaned_length}")

            title, file_type = self._split_name(file_path)

            metadata = extra_info.copy() if extra_info else {}
            metadata.update({
                'file_path': file_path,
                'title': title,
                'file_type': file_type,
                # Length in characters of the (possibly cleaned) content,
                # not the size of the file on disk.
                'file_size': len(content),
                'html_cleaned': self.clean_html,
            })

            document = Document(text=content, metadata=metadata)
            # NOTE: the id is the title only, so files sharing a base name
            # in different directories get the same document id.
            document.id_ = title
            documents.append(document)

        except Exception as e:
            print(f"Error loading data from {file_path}: {e}")
        return documents
