import sqlite3
from collections import OrderedDict

import numpy as np

from config import DB_PATH
from dataset_metrics.extract_info_from_reviews import biases

# Words searched for (as lowercase substrings) in the generated review text.
# NOTE(review): presumably vocabulary regarded as a marker of generated
# text -- confirm with the dataset authors.
SUSPICIOUS_WORDS = [
    'commendable',
    'innovative',
    'meticulous',
    'intricate',
    'notable',
    'versatile',
]

# Second word list run through the same statistics; reported by the script
# entry point under the heading "Structural words statistics".
STRUCTURE_WORDS = [
    'soundness',
    'novelty',
    'clarity',
    'significance',
    'strength',
    'weakness',
]

def presence_per_bias(bias, words_to_check):
    """Count the reviews of one bias type that contain each given word.

    For every word, runs a ``count(*)`` over the ``genai_review`` table
    restricted to rows whose ``type`` equals ``bias`` and whose lowercased
    ``generated`` column matches ``%word%`` with SQL ``LIKE``.

    Args:
        bias: Value compared against the ``type`` column.
        words_to_check: Iterable of words to look for. ``%`` and ``_``
            inside a word keep their ``LIKE`` wildcard meaning.

    Returns:
        A list with one row count per word, in the order of
        ``words_to_check``. Rows whose ``generated`` is NULL never match.
    """
    # The pattern is bound as a parameter rather than formatted into the SQL
    # text, so words containing quotes cannot break the statement and the
    # query no longer relies on double-quoted string literals.
    query = """
    select count(*) from genai_review
    where lower(generated) like ? and type=?
    """
    # sqlite3's connection context manager only commits/rolls back; it does
    # not close the connection, hence the explicit close below.
    conn = sqlite3.connect(str(DB_PATH))
    try:
        return [
            conn.execute(query, [f"%{word}%", bias]).fetchone()[0]
            for word in words_to_check
        ]
    finally:
        conn.close()


def occurrences_suspicious_words(bias, words_to_check):
    """Count word occurrences across the generated reviews of one bias type.

    Loads the ``generated`` column of every ``genai_review`` row whose
    ``type`` equals ``bias`` and counts, per word, the non-overlapping
    occurrences of the lowercased word in the lowercased text.

    Args:
        bias: Value compared against the ``type`` column.
        words_to_check: Sequence of words to count.

    Returns:
        A ``(total_count, average_count)`` tuple:

        * ``total_count`` -- ``OrderedDict`` mapping each word to its total
          number of occurrences over all selected rows.
        * ``average_count`` -- list, in the order of ``words_to_check``, of
          the mean number of occurrences per row. Each entry is NaN when no
          row matches ``bias``.
    """
    query = "select generated from genai_review where type=?"
    # The connection context manager does not close the connection, so close
    # it explicitly; it is only needed while fetching the rows.
    conn = sqlite3.connect(str(DB_PATH))
    try:
        all_rows = conn.execute(query, [bias]).fetchall()
    finally:
        conn.close()

    # Lowercase every review once instead of once per word. A NULL
    # ``generated`` value is treated as empty text (zero occurrences) rather
    # than raising AttributeError; the row still counts towards the average.
    texts = ["" if row[0] is None else row[0].lower() for row in all_rows]

    total_count = OrderedDict((word, 0) for word in words_to_check)
    average_count = []
    for word in words_to_check:
        # The text is lowercased, so lowercase the needle as well to stay
        # case-insensitive like the LIKE-based presence query.
        needle = word.lower()
        count_per_reviews = [text.count(needle) for text in texts]
        total_count[word] += sum(count_per_reviews)
        # Averaging an empty list would emit a RuntimeWarning before
        # yielding NaN; return NaN directly in that case.
        average_count.append(
            np.average(count_per_reviews) if count_per_reviews else np.nan
        )
    return total_count, average_count

def compute_presence_per_wordlist(words):
    """Print per-bias statistics for every word in ``words``.

    For each entry of the imported ``biases`` collection this prints a
    ``"<bias> bias result"`` header followed by one line per word of the
    form ``word: presence - occurrences - average``, where the three values
    come from ``presence_per_bias`` and ``occurrences_suspicious_words``.

    Args:
        words: Sequence of words passed to both statistic helpers.
    """
    for bias in biases:
        # Occurrence statistics are gathered before the presence counts,
        # and both before anything is printed for this bias.
        totals, averages = occurrences_suspicious_words(bias, words)
        presences = presence_per_bias(bias, words)

        print(f"{bias} bias result")
        per_word_stats = zip(presences, totals.values(), averages)
        for word, stats in zip(words, per_word_stats):
            n_reviews, n_hits, mean_hits = stats
            print(f"{word}: {n_reviews} - {n_hits} - {mean_hits}")

if __name__ == '__main__':
    # Report the same statistics for each word list, each section being
    # introduced by its title and terminated by a "####" separator line.
    report_sections = (
        ("Suspicious words statistics", SUSPICIOUS_WORDS),
        ("Structural words statistics", STRUCTURE_WORDS),
    )
    for section_title, section_words in report_sections:
        print(section_title)
        compute_presence_per_wordlist(section_words)
        print("####")

