import PyPDF2
import nltk
import re
from nltk.tokenize import sent_tokenize


# Fetch the NLTK tokenizer resources; this runs every time the module is imported
for _resource in ('punkt', 'punkt_tab'):
	nltk.download(_resource)


# 
# By using this file, you are agreeing to this product's EULA
#
# This product can be obtained in https://anonymous.4open.science/r/SAFE-ICLR
#
# Copyright ©2024-2025 XXXX-1
#


def tokenize(text):
	"""Split text into sentences with NLTK's sent_tokenize and return them."""
	sentences = sent_tokenize(text)
	return sentences

def clean_text(text):
	"""
	Perform basic text cleaning:
	- Re-join words that were hyphenated across a line break
	- Replace the remaining line breaks with spaces
	- Strip leading and trailing whitespace

	Only a hyphen that ends a line and is followed by more text is removed.
	Hyphens followed by ordinary spaces inside a line (e.g. "pre- and
	post-processing" or "a - b") are kept as they are.

	Parameters:
		text: the raw string to clean.

	Returns:
		The cleaned string, without line breaks.
	"""
	# De-hyphenate BEFORE flattening line breaks: once a line break has become
	# a space, a line-break hyphen cannot be told apart from an in-line one.
	# The lookahead keeps a dangling hyphen at the very end of the text.
	text = re.sub(r'-[^\S\n]*\n\s*(?=\S)', '', text)
	# Flatten the remaining line breaks
	return text.replace('\n', ' ').strip()


def extract_sentences_from_pdf(pdf_path):
	"""
	Read the PDF at pdf_path and return its text split into sentences.

	The text of all pages is concatenated without a separator, passed
	through clean_text and then split with tokenize.
	"""
	with open(pdf_path, 'rb') as pdf_handle:
		pdf = PyPDF2.PdfReader(pdf_handle)
		# Pull the page text out while the file is still open
		raw_text = "".join(page.extract_text() for page in pdf.pages)

	return tokenize(clean_text(raw_text))






import os.path
from joblib import load, dump


import itertools


#
# Loads a PDF and extracts its sentences as a list, which is cached on disk in filename_out
#
def extractSentencesFromPDF(filename_in, filename_out):
	"""
	Return the sentences of a PDF, using filename_out as an on-disk cache.

	If filename_out already exists, its content is loaded with joblib and
	filename_in is not read at all. Otherwise the sentences are extracted
	from filename_in with extract_sentences_from_pdf, dumped to filename_out
	with joblib and returned.

	Parameters:
		filename_in: path of the PDF to read.
		filename_out: path of the joblib cache file to load or create.

	Returns:
		The list of sentences.

	NOTE(review): joblib persistence is pickle-based, so filename_out should
	only ever point at cache files this code produced itself.
	"""
	# Cache hit: reuse the previously stored result
	if os.path.isfile(filename_out):
		return load(filename_out)

	# Cache miss: run the shared PDF pipeline instead of duplicating it here
	sentences = extract_sentences_from_pdf(filename_in)
	dump(sentences, filename_out)
	return sentences

def extractSentencesFromTXT(filename, recalculate = False):
	"""
	Return the sentences of a text file, cached under "precomputed/".

	The cache file is "precomputed/<filename>.lib". When it exists and
	recalculate is false, it is loaded with joblib and the text file is not
	read. Otherwise every line of the file is cleaned with clean_text and
	split with tokenize on its own, the resulting sentences are collected
	into one flat list, and that list is dumped to the cache file.

	Parameters:
		filename: path of the text file to read.
		recalculate: when true, ignore an existing cache file and rebuild it.

	Returns:
		The flat list of sentences.
	"""
	libfile = "precomputed/" + filename+".lib"
	if not recalculate and os.path.isfile(libfile):
		return load(libfile)

	# The context manager closes the file even if reading fails
	with open(filename) as file:
		lines = file.readlines()

	# Clean and split line by line; extending with an empty result is a no-op
	sent_join = []
	for line in lines:
		sent_join.extend(tokenize(clean_text(line)))

	# Create the cache directory first so dump cannot fail on a missing folder
	os.makedirs(os.path.dirname(libfile), exist_ok=True)
	dump(sent_join, libfile)
	return sent_join