import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.LinkedList;
import java.util.List;

import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.JDOMException;
import org.jdom2.Namespace;
import org.jdom2.input.SAXBuilder;
import org.semanticweb.yars.util.CallbackNQOutputStream;

import com.ontologycentral.ldspider.Crawler;
import com.ontologycentral.ldspider.frontier.BasicFrontier;
import com.ontologycentral.ldspider.frontier.Frontier;
import com.ontologycentral.ldspider.hooks.content.ContentHandler;
import com.ontologycentral.ldspider.hooks.content.ContentHandlerNx;
import com.ontologycentral.ldspider.hooks.content.ContentHandlerRdfXml;
import com.ontologycentral.ldspider.hooks.content.ContentHandlers;
import com.ontologycentral.ldspider.hooks.sink.Sink;
import com.ontologycentral.ldspider.hooks.sink.SinkCallback;

/**
 * Runs an LDSpider crawl whose seed URIs are derived from the SEC EDGAR
 * US-GAAP RSS feed, the Yahoo finance wrapper and a local {@code seed.txt}.
 *
 * <p>An instance keeps state between calls of {@link #run(String)}: the
 * archive id of the newest feed item seen so far and the list of tickers that
 * get a dated archive URI added on every run.</p>
 *
 * @author Tobias Weller, AIFB
 *
 */
public class runldSpider {
	/** Prefix of a filing link in the feed; CIK, folder and file name follow it. */
	private static final String EDGAR_DATA_PREFIX = "http://www.sec.gov/Archives/edgar/data/";
	/** Upper bound on the number of feed items inspected per run. */
	private static final int MAX_FEED_ITEMS = 20;
	private static final Namespace OWL = Namespace.getNamespace("owl", "http://www.w3.org/2002/07/owl#");
	private static final Namespace RDF = Namespace.getNamespace("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#");

	/** Archive id of the newest feed item seen by the previous call of getLinks. */
	private String lastSeed;
	/** Seed URIs handed to the crawler frontier; cleared after each crawl. */
	private final LinkedList<URI> seedList;
	/** Tickers for which a dated archive URI is added on every run. */
	private final LinkedList<String> dailyCrawlList;
	/** Calendar set to the time of construction; supplies the date for daily URIs. */
	private Calendar c;
	
	/**
	 * Creates a runner with empty seed lists and the calendar set to "now".
	 */
	public runldSpider() {
		this.lastSeed = "";
		this.seedList = new LinkedList<URI>();
		this.dailyCrawlList = new LinkedList<String>();
		// date
		DateFormat df = new SimpleDateFormat("dd.MM.yyyy");
		this.c = df.getCalendar();
		this.c.setTimeInMillis(System.currentTimeMillis());
	}
	
	/**
	 * Collects the seed URIs for the next crawl.
	 *
	 * <p>Reads up to {@value #MAX_FEED_ITEMS} items of the SEC feed (stopping at
	 * the archive already seen on the previous call), adds the matching
	 * edgarwrap and, if a ticker can be resolved, yahoofinancewrap URIs, then
	 * appends every URI listed in {@code seed.txt}, empties that file and
	 * finally adds the daily archive URIs.</p>
	 *
	 * @param fileArchive directory prefix (concatenated as-is, so it must end
	 *        with a path separator) containing {@code seed.txt}
	 * @throws JDOMException if the SEC feed is not well-formed XML
	 * @throws IOException if the SEC feed cannot be read
	 * @throws URISyntaxException if a generated or listed seed is not a valid URI
	 */
	private void getLinks(String fileArchive) throws JDOMException, IOException, URISyntaxException {
		URL url = new URL("http://www.sec.gov/Archives/edgar/usgaap.rss.xml");
		SAXBuilder builder = new SAXBuilder();
		Document doc = builder.build(url);
		Element channel = doc.getRootElement().getChild("channel");
		List<Element> itemList = (channel == null) ? new LinkedList<Element>() : channel.getChildren("item");
		
		int limit = Math.min(itemList.size(), MAX_FEED_ITEMS);
		for (int i = 0; i < limit; i++) {
			String[] filing = parseFilingLink(itemList.get(i).getChildText("link"));
			if (filing == null) {
				// Link does not have the expected EDGAR layout; skip instead of crashing the run.
				continue;
			}
			String cik = filing[0];
			String archive = filing[1];
			if (archive.equalsIgnoreCase(this.lastSeed)) {
				// Everything from here on was already seeded by the previous call.
				break;
			}
			this.seedList.addLast(new URI("http://edgarwrap.ontologycentral.com/archive/" + cik + "/" + archive));
			this.seedList.addLast(new URI("http://edgarwrap.ontologycentral.com/cik/" + cik));
			this.addTickerSeeds(cik);
			System.out.println(i + " Found CIK: " + cik);
		}
		if (!itemList.isEmpty()) {
			String[] newest = parseFilingLink(itemList.get(0).getChildText("link"));
			if (newest != null) {
				this.lastSeed = newest[1];
			}
		}
		
		this.readSeedFile(fileArchive);
		this.addDailyList();
		
	}

	/**
	 * Splits an EDGAR filing link into CIK and archive id.
	 *
	 * @param link value of an item's {@code link} element, may be {@code null}
	 * @return {@code {cik, archive}}, or {@code null} if the link does not contain
	 *         the EDGAR data prefix followed by at least three path segments
	 */
	private static String[] parseFilingLink(String link) {
		if (link == null) {
			return null;
		}
		int start = link.indexOf(EDGAR_DATA_PREFIX);
		if (start < 0) {
			return null;
		}
		String[] parts = link.substring(start + EDGAR_DATA_PREFIX.length()).split("/");
		if (parts.length < 3) {
			return null;
		}
		// The archive id is the file name up to the "-index" suffix.
		String fileName = parts[2];
		int suffix = fileName.indexOf("-index");
		String archive = (suffix >= 0) ? fileName.substring(0, suffix) : fileName;
		return new String[] { parts[0], archive };
	}

	/**
	 * Resolves the ticker for a CIK via the Yahoo finance wrapper and, on
	 * success, adds the ticker URI to the seeds and the ticker to the daily
	 * list. The lookup is best effort: any failure is only reported on stdout.
	 *
	 * @param cik company identifier taken from the feed link
	 */
	private void addTickerSeeds(String cik) {
		boolean tickerFound = false;
		try {
			String ticker = this.lookupTicker(cik);
			if (ticker != null) {
				this.seedList.addLast(new URI("http://yahoofinancewrap.appspot.com/ticker/" + ticker));
				this.dailyCrawlList.add(ticker);
				tickerFound = true;
			}
		} catch (Exception ignored) {
			// Deliberately broad: network, parse and URI errors all mean "no ticker".
		}
		if (!tickerFound) {
			System.out.println("No Ticker Found for: " + cik);
		}
	}

	/**
	 * Fetches the wrapper document for a CIK and extracts the ticker from the
	 * {@code owl:sameAs} reference of its second {@code rdf:Description}.
	 *
	 * @param cik company identifier
	 * @return the ticker, or {@code null} if the document has no usable reference
	 * @throws JDOMException if the response is not well-formed XML
	 * @throws IOException if the wrapper cannot be reached
	 */
	private String lookupTicker(String cik) throws JDOMException, IOException {
		URL urlYahoo = new URL("http://yahoofinancewrap.appspot.com/cik/" + cik);
		Document docYahoo = new SAXBuilder().build(urlYahoo);
		List<Element> descriptions = docYahoo.getRootElement().getChildren("Description", RDF);
		if (descriptions.size() < 2) {
			return null;
		}
		Element sameAs = descriptions.get(1).getChild("sameAs", OWL);
		if (sameAs == null) {
			return null;
		}
		// RDF/XML names this attribute rdf:resource; the previously used
		// spelling "ressource" is kept as a fallback.
		String reference = sameAs.getAttributeValue("resource", RDF);
		if (reference == null) {
			reference = sameAs.getAttributeValue("ressource", RDF);
		}
		if (reference == null) {
			return null;
		}
		String[] afterTicker = reference.split("../ticker/");
		if (afterTicker.length < 2) {
			return null;
		}
		String[] beforeFragment = afterTicker[1].split("#");
		return (beforeFragment.length == 0) ? null : beforeFragment[0];
	}

	/**
	 * Appends every non-blank line of {@code seed.txt} to the seed list,
	 * remembers tickers of yahoofinancewrap archive/ticker URIs for the daily
	 * list and empties the file after it was read completely. A missing or
	 * unreadable file is only reported on stdout.
	 *
	 * @param fileArchive directory prefix containing {@code seed.txt}
	 * @throws URISyntaxException if a line is not a valid URI
	 */
	private void readSeedFile(String fileArchive) throws URISyntaxException {
		File seedFile = new File(fileArchive + "seed.txt");
		try {
			BufferedReader reader = new BufferedReader(new FileReader(seedFile));
			try {
				String line;
				while ((line = reader.readLine()) != null) {
					String seed = line.trim();
					if (seed.length() == 0) {
						continue;
					}
					this.seedList.addLast(new URI(seed));
					if (seed.contains("yahoofinancewrap.appspot.com/archive") || seed.contains("yahoofinancewrap.appspot.com/ticker")) {
						// Path after the host is "<archive|ticker>/<TICKER>[/...]".
						String[] afterHost = seed.split("com/");
						if (afterHost.length > 1) {
							String[] segments = afterHost[1].split("/");
							if (segments.length > 1) {
								this.dailyCrawlList.add(segments[1]);
							}
						}
					}
				}
			} finally {
				reader.close();
			}
			// Opening without append truncates the file: the seeds are consumed.
			FileWriter writer = new FileWriter(seedFile, false);
			writer.close();
		} catch (FileNotFoundException e) {
			System.out.println("seed.txt Datei nicht gefunden");
		} catch (IOException e) {
			System.out.println("E/A-Fehler");
		}
	}

	/**
	 * Adds one dated yahoofinancewrap archive URI per ticker of the daily list
	 * to the crawler seed list, except when the stored date is a Sunday.
	 * 
	 * @throws URISyntaxException if a ticker yields an invalid URI
	 */
	private void addDailyList() throws URISyntaxException {
		// NOTE(review): the date is the one captured in the constructor, not the
		// time of this call - confirm this is intended for long-lived instances.
		if (this.c.get(Calendar.DAY_OF_WEEK) != Calendar.SUNDAY) {
			String date = this.c.get(Calendar.YEAR) + "-" + (this.c.get(Calendar.MONTH) + 1) + "-" + this.c.get(Calendar.DAY_OF_MONTH);
			for (String ticker : this.dailyCrawlList) {
				this.seedList.addLast(new URI("http://yahoofinancewrap.appspot.com/archive/" + ticker + "/" + date));
			}
		}	
	}
	
	/**
	 * Collects the seeds and, if there are any, runs LDSpider over them.
	 *
	 * <p>The crawl strategy is read from {@code Settings_LDSpider.txt}; the
	 * crawled data is written as N-Quads to {@code upload/rdfData.txt}. Problems
	 * with the settings file are printed and the crawl is skipped.</p>
	 * 
	 * @param fileArchive directory prefix (concatenated as-is, so it must end
	 *        with a path separator) holding {@code seed.txt},
	 *        {@code Settings_LDSpider.txt} and the {@code upload/} directory
	 * @throws URISyntaxException if a seed is not a valid URI
	 * @throws IOException if the SEC feed cannot be read or the output file
	 *         cannot be opened or closed
	 * @throws JDOMException if the SEC feed is not well-formed XML
	 */
	public void run(String fileArchive) throws URISyntaxException, IOException, JDOMException {
		System.out.println("Path: " + fileArchive);
		this.getLinks(fileArchive);
		if (this.seedList.isEmpty()) {
			return;
		}

		Crawler crawler = new Crawler();
		Frontier frontier = new BasicFrontier();
		frontier.addAll(this.seedList);

		ContentHandler contentHandler = new ContentHandlers(new ContentHandlerRdfXml(), new ContentHandlerNx());
		crawler.setContentHandler(contentHandler);
		
		java.io.OutputStream os = new FileOutputStream(fileArchive + "upload/rdfData.txt");
		try {
			Sink sink = new SinkCallback(new CallbackNQOutputStream(os));
			crawler.setOutputCallback(sink);

			java.util.logging.Logger.getLogger("com.ontologycentral.ldspider").setLevel(java.util.logging.Level.WARNING);
			
			try {
				String[] settings = readCrawlSettings(fileArchive);
				if (settings[0].equalsIgnoreCase("BREADTH")) {
					crawler.evaluateBreadthFirst(frontier, Integer.parseInt(settings[1]), Integer.parseInt(settings[2]), Integer.parseInt(settings[3]));
				} else {
					crawler.evaluateLoadBalanced(frontier, Integer.parseInt(settings[1]));
				}
			} catch (IOException e) {
				e.printStackTrace();
			} finally {
				// Release the crawler even if the settings were unusable or the crawl failed.
				crawler.close();
			}
		} finally {
			os.close();
		}
		
		this.seedList.clear();
	}

	/**
	 * Reads the first line of {@code Settings_LDSpider.txt} and splits it at
	 * {@code @}. A first field of {@code BREADTH} requires four fields, anything
	 * else requires two.
	 *
	 * @param fileArchive directory prefix containing the settings file
	 * @return the fields of the settings line, with at least the required count
	 * @throws IOException if the file is missing, unreadable, empty or has too
	 *         few fields
	 */
	private static String[] readCrawlSettings(String fileArchive) throws IOException {
		BufferedReader reader = new BufferedReader(new FileReader(fileArchive + "Settings_LDSpider.txt"));
		String line;
		try {
			line = reader.readLine();
		} finally {
			reader.close();
		}
		if (line == null) {
			throw new IOException("Settings_LDSpider.txt is empty");
		}
		String[] settings = line.split("@");
		boolean breadthFirst = settings.length > 0 && settings[0].equalsIgnoreCase("BREADTH");
		int required = breadthFirst ? 4 : 2;
		if (settings.length < required) {
			throw new IOException("Settings_LDSpider.txt needs " + required + " '@'-separated fields but has " + settings.length + ": " + line);
		}
		return settings;
	}
	
}
