



























import logging
import re

from .enums import ProbingState

INTERNATIONAL_WORDS_PATTERN = re.compile(
    b"[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?"
)


class CharSetProber:

    SHORTCUT_THRESHOLD = 0.95

    def __init__(self, lang_filter=None):
        self._state = None
        self.lang_filter = lang_filter
        self.logger = logging.getLogger(__name__)

    def reset(self):
        self._state = ProbingState.DETECTING

    @property
    def charset_name(self):
        return None

    def feed(self, byte_str):
        raise NotImplementedError

    @property
    def state(self):
        return self._state

    def get_confidence(self):
        return 0.0

    @staticmethod
    def filter_high_byte_only(buf):
        buf = re.sub(b"([\x00-\x7F])+", b" ", buf)
        return buf

    @staticmethod
    def filter_international_words(buf):
        
        filtered = bytearray()

        
        
        
        words = INTERNATIONAL_WORDS_PATTERN.findall(buf)

        for word in words:
            filtered.extend(word[:-1])

            
            
            
            
            last_char = word[-1:]
            if not last_char.isalpha() and last_char < b"\x80":
                last_char = b" "
            filtered.extend(last_char)

        return filtered

    @staticmethod
    def remove_xml_tags(buf):
        
        filtered = bytearray()
        in_tag = False
        prev = 0
        buf = memoryview(buf).cast("c")

        for curr, buf_char in enumerate(buf):
            
            if buf_char == b">":
                prev = curr + 1
                in_tag = False
            elif buf_char == b"<":
                if curr > prev and not in_tag:
                    
                    
                    filtered.extend(buf[prev:curr])
                    
                    filtered.extend(b" ")
                in_tag = True

        
        if not in_tag:
            
            
            filtered.extend(buf[prev:])

        return filtered
