import numpy as np

class DataStreamer(object):
    '''
    Stream instances one at a time from a text data file and preprocess each.

    Two dataset names are recognised by ``load``:

    * ``'HIGGS'`` / ``'higgs'`` -- each line is ``label idx:val idx:val ...``
      with 1-based feature indices; the last feature slot is a constant 1.0.
    * ``'pokerhand'`` -- each line is eleven comma-separated integers read
      from ``poker-hand-testing.data``; the last integer is the label.
    '''

    def __init__(self, **kwargs):
        '''
        Set up the label remapping state. ``kwargs`` is accepted but unused.
        '''
        # Raw label -> label handed to the caller. 1.0 is mapped to itself;
        # any other raw label takes the next value popped from label_cand.
        self.convert_dict = {1.0: 1.0}
        self.label_cand = [1.0, -1.0]

    def load(self, data_name, data_path):
        '''
        Select a dataset, record its size and location, and open its file.

        Parameters
        ----------
        data_name : 'HIGGS', 'higgs' or 'pokerhand'.
        data_path : string prefix that is concatenated directly with the
            file name, so it must already end with a path separator.

        Any other ``data_name`` leaves the stored configuration untouched
        and still calls ``reset()``.
        '''
        if data_name in ('HIGGS', 'higgs'):
            self.data_name = 'HIGGS'
            # reset() reads self.data_path for this dataset as well, so it
            # has to be stored here just like in the pokerhand branch.
            self.data_path = data_path
            # 28 features plus one constant slot filled by next_instance.
            self.N, self.D = 11000000, 28 + 1
        elif data_name == 'pokerhand':
            self.data_name = 'pokerhand'
            self.data_path = data_path
            self.N = 1000000
            self.D = 25

        self.reset()

    def reset(self):
        '''
        (Re)open the data file of the current dataset at its first line.

        A handle left over from a previous open is closed first so that
        repeated resets do not leak file descriptors.
        '''
        previous = getattr(self, 'f', None)
        if previous is not None:
            previous.close()

        if self.data_name in ('HIGGS', 'higgs'):
            self.f = open(self.data_path + self.data_name, 'r')
        elif self.data_name == 'pokerhand':
            self.f = open(self.data_path + 'poker-hand-testing.data', 'r')

    def next_instance(self):
        '''
        Read the next line of the open file and return it as ``(x, y)``.

        Returns
        -------
        x : numpy array of length ``self.D``.
        y : the label (remapped through ``_adjust_labeling`` for HIGGS, the
            raw integer for pokerhand).

        Raises
        ------
        IndexError : HIGGS, when the line is empty (e.g. end of file).
        ValueError : pokerhand, when the line is empty (e.g. end of file).
        '''
        if self.data_name in ('HIGGS', 'higgs'):
            fields = self.f.readline().strip().split()
            y = self._adjust_labeling(float(fields[0]))
            x = np.zeros(self.D)

            # Sparse "index:value" pairs; indices in the file are 1-based.
            for pair in fields[1:]:
                idx, val = pair.split(':')
                x[int(idx) - 1] = float(val)
            # Constant last feature.
            x[-1] = 1.0

        elif self.data_name == 'pokerhand':
            line = self.f.readline().strip('\r').strip('\n')
            r = [int(tok) for tok in line.split(',')]
            y = r[-1]
            x = np.zeros(self.D)
            # Even positions are one-hot encoded at index 4*i + value for
            # card i; odd positions are rescaled as (value - 7) / 6.
            # NOTE(review): with values in 1..4 the one-hot indices span
            # 1..20, so slot 0 is never set and slot 20 is overwritten by
            # the assignment below -- possibly an off-by-one; left unchanged
            # to keep the feature layout stable. Confirm before altering.
            x[np.arange(5) * 4 + r[:-1:2]] = 1
            x[20:] = np.array(r[1::2], dtype='float') / 6 - 7.0 / 6

        return x, y

    def _adjust_labeling(self, y):
        '''
        Map a raw label to the label returned to callers.

        Labels already seen are looked up in ``convert_dict``; a new label
        is assigned the next value popped from ``label_cand`` (an
        ``IndexError`` from ``pop`` propagates once the candidates run out).
        '''
        try:
            return self.convert_dict[y]
        except KeyError:
            self.convert_dict[y] = self.label_cand.pop()
            return self.convert_dict[y]
