
package edu.hit.ir.xmlAnalysis;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.text.Format;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
//import java.awt.List;

import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.input.SAXBuilder;
import org.jdom.output.XMLOutputter;


/**
 * <p>Title: XmlAnalysis</p>
 * <p>Description: 
 * 		special for ltp xml;
 * </p>
 * <p>Organization: HIT-CIR</p>
 * @author    Zhonghua Han (zhhan@ir.hit.edu.cn)
 * @date       Nov 10, 2009
 *
 */
public class XmlAnalysis {
	private SAXBuilder xml;
	private Document doc;
	private Element root;
	private ArrayList<ArrayList> pIndex = new ArrayList<ArrayList>();	//paragraph index
	private ArrayList<ArrayList<Word>> sIndex = new ArrayList<ArrayList<Word>>();	//sentence index

	public XmlAnalysis() {
		super();
		xml = new SAXBuilder();
	}
	
	public void build(File file) throws JDOMException, IOException {
		doc = xml.build(file);
		root = doc.getRootElement().getChild("doc");
		initIndex();
	}	
	
	public void build(String sourceStr)
			throws JDOMException, IOException {
//		System.out.println("here");
		doc = xml.build(new ByteArrayInputStream(sourceStr.getBytes()));
		root = doc.getRootElement().getChild("doc");
		initIndex();
	}

	private void initIndex(){
		pIndex.clear();
		sIndex.clear();
		for(int i = 0; i<root.getChildren().size(); ++i){
			Element pele = (Element)root.getChildren().get(i);
			ArrayList<ArrayList> sents = new ArrayList<ArrayList>();
			for(int j = 0; j<pele.getChildren().size(); ++j){
				Element sele = (Element)pele.getChildren().get(j);
				ArrayList<Word> wds = new ArrayList<Word>();
				for(int p = 0; p<sele.getChildren().size(); ++p){
					wds.add(new Word((Element)sele.getChildren().get(p)));
				}
				if(!wds.isEmpty()){
					sents.add(wds);
					sIndex.add(wds);
				}
			}
			if(!sents.isEmpty()){
				pIndex.add(sents);
			}
		}
	}
	
	/**
	 * <p>Title: cloneWordList</p>
	 * <p>Description: word list deep copy</p>
	 * @param wordList
	 * @return
	 */
	private ArrayList<Word> cloneWordList(ArrayList<Word> wordList){
		ArrayList<Word> alw = new ArrayList<Word>();
		if(wordList!= null && !wordList.isEmpty()){
			for(int i = 0; i< wordList.size(); ++i){
				try {
					alw.add((Word) wordList.get(i).clone());
				} catch (CloneNotSupportedException e) {
					// TODO Auto-generated catch block
					e.printStackTrace();
				}
			}
		}
		return alw;
	}
	
	/*	
	public void build(String systemId) throws JDOMException, IOException {
		doc = xml.build(systemId);
	}
//*/
	public void outputters(){
		XMLOutputter outputter = new XMLOutputter();//Format.getPrettyFormat());
        try {
//        	Document doc = new Document();
        	outputter.output(doc, System.out);
//            outputter.output(doc, new FileOutputStream(new File("ltp.xml")));
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } 
	}
	public boolean isWs(){	
		return doc.getRootElement().getChild("note").getAttributeValue("word").equals("y");
	}
	public boolean isPos(){
		return doc.getRootElement().getChild("note").getAttributeValue("pos").equals("y");
	}
	public boolean isNer(){
		return doc.getRootElement().getChild("note").getAttributeValue("ne").equals("y");
	}
	public boolean isDp(){
		return doc.getRootElement().getChild("note").getAttributeValue("parser").equals("y");
	}
	public boolean isWsd(){
		return doc.getRootElement().getChild("note").getAttributeValue("wsd").equals("y");
	}
	public boolean isSrl(){
		return doc.getRootElement().getChild("note").getAttributeValue("srl").equals("y");
	}

	/**
	 * <p>Title: SaveDom</p>
	 * <p>Description: 
	 * 		save the dom tree into a file;
	 * </p>
	 * @param filename
	 */
	public void SaveDom(String filename){
		XMLOutputter outputter = new XMLOutputter();//Format.getPrettyFormat());
        try {
        	outputter.output(doc, new FileOutputStream(filename));
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } 
	}

	/**
	 * <p>Title: getDom</p>
	 * <p>Description: 
	 * 		get the dom tree;
	 * </p>
	 * @return 
	 */
	public Document getDom(){
		return doc;
	}
	
	/**
	 * <p>Title: CountParagraphInDocument</p>
	 * @return the number of paragraphs in the document;
	 */
	public int CountParagraphInDocument(){		
//		return doc.getRootElement().getChild("doc").getChildren().size();
		return pIndex.size();
	}

	/**
	 * <p>Title: CountSentenceInParagraph</p>
	 * @param paragraphIdx
	 * @return the number of sentences in a special paragraph
	 */
	public int CountSentenceInParagraph(int paragraphIdx){
//		return ((Element) doc.getRootElement().getChild("doc").getChildren("para").get(paragraphIdx)).getChildren().size();
		return pIndex.get(paragraphIdx).size();
	}
	
	/**
	 * <p>Title: CountSentenceInDocument</p>
	 * @return the number of sentences in the whole document;
	 */
	public int CountSentenceInDocument(){
		/*
		int counts = 0;
		List paras = doc.getRootElement().getChild("doc").getChildren("para");
		for(int i=0; i<paras.size(); ++i){
			counts += ((Element) paras.get(i)).getChildren().size();
		}
		return counts;
		//*/
		return sIndex.size();
	}
	
	/**
	 * <p>Title: CountWordInSentence</p>
	 * <p>Description: 
	 * 		according to the index of paragraph and sentence;
	 * </p>
	 * @param paragraphIdx
	 * @param sentenceIdx
	 * @return the word number in a special sentence;
	 */
	public int CountWordInSentence(int paragraphIdx, int sentenceIdx){
		/*
		return 
			((Element)
					(
							((Element)(root.getChildren().get(paragraphIdx))
							).getChildren().get(sentenceIdx)
					)
			).getChildren().size();
			//*/
		return ((ArrayList<Word>)pIndex.get(paragraphIdx).get(sentenceIdx)).size();
	}
	
	/**
	 * <p>Title: CountWordInSentence</p>
	 * <p>Description: 
	 * 		according to the index of global sentence id;
	 * </p>
	 * @param globalSentIdx
	 * @return the word number in a special sentence;
	 */
	public int CountWordInSentence(int globalSentIdx){
		/*
		List paras = root.getChildren("para");
		for( int i= 0; i<paras.size(); ++i){
			List sents = ((Element)paras.get(i)).getChildren();
			if(globalSentIdx < sents.size()){
				return ((Element)sents.get(globalSentIdx)).getChildren().size();
			}else{
				globalSentIdx -= sents.size();
			}
		}
		return 0;
		//*/
		return sIndex.get(globalSentIdx).size();
	}
	//*/
	
	/**
	 * <p>Title: CountWordInParagraph</p>
	 * <p>Description: </p>
	 * @param paragraphIdx
	 * @return the word number in a special paragraph;
	 */
	public int CountWordInParagraph(int paragraphIdx){
		//*
		int counts = 0;
		List sents = ((Element)root.getChildren().get(paragraphIdx)).getChildren();
		for(int i = 0; i< sents.size(); i++){
			counts += ((Element)sents.get(i)).getChildren().size();
		}
		return counts;
		//*/		
	}
	
	/**
	 * <p>Title: CountWordInDocument</p>
	 * @return the word number in the whole document;
	 */
	public int CountWordInDocument(){
		int counts = 0;
		/*
		List paras = root.getChildren();
//		System.out.println("paras size():" + paras.size());
		for( int i= 0; i<paras.size(); ++i){
			List sents = ((Element)paras.get(i)).getChildren("sent");
//			System.out.println("	sents.size():" + sents.size());
			for( int j= 0; j<sents.size(); ++j){
//				System.out.println("		word.size():" + ((Element)sents.get(j)).getChildren("word").size());
				counts += ((Element)sents.get(j)).getChildren("word").size();
			}
		}
		//*/
		for(int i = 0; i<sIndex.size(); ++i){
			counts += sIndex.get(i).size();
		}
		return counts;
		
	}	
	/*
	public Paragraph getParagraph(int paragraphId){
		return (Paragraph)root.getChildren().get(paragraphId);
	}
	
	public List<Paragraph> getParagraphs(){
		return (List<Paragraph>)root.getChildren();
	}
	//*/
	/**
	 * <p>Title: getWord</p>
	 * <p>Description: 
	 * 		according to the index of paragraph, sentence and word;
	 * </p>
	 * @param paragraphId
	 * @param sentenceId
	 * @param wordId
	 * @return get the type of Word;
	 */
	public Word getWord(int paragraphId, int sentenceId, int wordId) {
/*
		return new Word((Element) ((Element) ((Element) root.getChildren().get(
				paragraphId)).getChildren().get(sentenceId)).getChildren().get(
				wordId));
				//*/
		Word wd = null;
		try {
			wd = (Word)((ArrayList<Word>)pIndex.get(paragraphId).get(sentenceId)).get(wordId).clone();
		} catch (CloneNotSupportedException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		}
		return wd;
	}
	
	/**
	 * <p>Title: getWords</p>
	 * <p>Description: 
	 * 		according to the index of paragraph and sentence;
	 * </p>
	 * @param paragraphId
	 * @param sentenceId
	 * @return ArrayList<Word>;
	 */
	public ArrayList<Word> getWords(int paragraphId, int sentenceId){
		/*
		List<Word> wordList = new LinkedList<Word>();
		for( int i = 0; i<CountWordInSentence(paragraphId, sentenceId); ++i){
			wordList.add(getWord(paragraphId, sentenceId, i));
		}
		return wordList;
		//*/
		return cloneWordList((ArrayList<Word>) ((ArrayList<Word>)pIndex.get(paragraphId).get(sentenceId)));
	}
	
	/**
	 * <p>Title: getWord</p>
	 * <p>Description: 
	 * 		according to the index of global word id;
	 * 		this method is not advocated;
	 * </p>
	 * @param globalWordId
	 * @return get the type of Word;
	 */
	public Word getWord(int globalWordId){
		/*
		List paras = root.getChildren("para");
		for( int i= 0; i<paras.size(); ++i){
			List sents = ((Element)paras.get(i)).getChildren();
			for( int j = 0; j<sents.size(); ++j){
				List words = ((Element)sents.get(j)).getChildren();
				if(globalWordId < words.size()){
					return new Word((Element)words.get(globalWordId));
				}else{
					globalWordId -= words.size();
				}
			}
		}
		return null;
		//*/
		Word wd = null;
		for(int i = 0; i<sIndex.size(); ++i){
			if(globalWordId < sIndex.get(i).size()){
				try {
					wd = (Word) sIndex.get(i).get(globalWordId).clone();
				} catch (CloneNotSupportedException e) {
					// TODO Auto-generated catch block
					e.printStackTrace();
				}
				break;
			}else{
				globalWordId -= sIndex.get(i).size();
			}
		}
		return wd;
	}
	
	/**
	 * <p>Title: getWordsFromSentence</p>
	 * <p>Description: 
	 * 		according to the index of global sentence id;
	 * </p>
	 * @param globalSentenceId
	 * @return	ArrayList<Word>;
	 */
	public ArrayList<Word> getWordsFromSentence(int globalSentenceId){
		/*
		List paras = root.getChildren("para");
		
		for(int i=0; i<paras.size(); ++i){
			int counts = ((Element) paras.get(i)).getChildren().size();
			if( globalSentenceId <counts){
				Element sents= (Element)((Element) paras.get(i)).getChildren().get(globalSentenceId);
				List<Word> wordList = new LinkedList<Word>();
				for(int j = 0; j<sents.getChildren().size(); ++j){
					wordList.add(new Word((Element)sents.getChildren().get(j)));
				}
				return wordList;
			}
		}
		return null;
		//*/
		return cloneWordList((ArrayList<Word>) sIndex.get(globalSentenceId).clone());
		
	}
	
	/**
	 * <p>Title: getSentenceContent</p>
	 * <p>Description: 
	 * 		according to the index of paragraph and sentence;
	 * </p>
	 * @param paragraphIdx
	 * @param sentenceIdx
	 * @return the content of a sentence;
	 */
	public String getSentenceContent(int paragraphIdx, int sentenceIdx){		
		return 
		((Element)
				(
						((Element)(root.getChildren().get(paragraphIdx))
						).getChildren().get(sentenceIdx)
				)
		).getAttributeValue("cont");
	}
	
	/**
	 * <p>Title: getSentenceContent</p>
	 * <p>Description: 
	 * 		according to the index of global sentence id;
	 * </p>
	 * @param globalSentIdx
	 * @return the content of a sentence; 
	 */
	public String getSentenceContent(int globalSentIdx){
		for(int i = 0; i<CountParagraphInDocument(); ++i){
				if(globalSentIdx < CountSentenceInParagraph(i)){
					return getSentenceContent(i, globalSentIdx);
				}else{
					globalSentIdx -= CountSentenceInParagraph(i);
				}
		}
		return null;
	}
	
}
