/*
 * Copyright (C) 2008 by Instytut Podstaw Informatyki Polskiej
 * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish
 * Academy of Sciences; cf. www.ipipan.waw.pl).  All rights reserved.
 *
 * This file is part of Spejd.
 *
 * Spejd is free software: it may be distributed and/or modified under 
 * the terms of the GNU General Public License version 3 as published 
 * by the Free Software Foundation and appearing in the file doc/gpl.txt
 * included in the packaging of this file.
 *
 * A commercial license is available from IPI PAN (contact
 * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more
 * information).  Licensees holding a valid commercial license from IPI
 * PAN may use this file in accordance with that license.
 *
 * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING
 * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE.
 */ 

package ipipan.spejd.readers;

import ipipan.spejd.entities.Entity;
import ipipan.spejd.entities.Interpretation;
import ipipan.spejd.entities.Segment;
import ipipan.spejd.entities.NoSpace;
import ipipan.spejd.util.Config;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.io.Reader;
import java.nio.charset.Charset;

import java.util.HashSet;
import java.util.Collection;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;

import morfologik.stemmers.Stempelator;

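/**
 * Reader that tokenizes plain text into word and punctuation segments,
 * obtaining the interpretations of words from a {@link Stempelator}.
 *
 * @author axw
 */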
public class PlainTextReader extends ipipan.spejd.readers.Reader {
    

    // analyser queried (via stemAndForm) for the interpretations of each word
    private Stempelator stemmer;
    
    private String fileContents; // the text being tokenized (loaded file or text set directly)
    private String lastWord; // most recently returned word; used to distinguish acronyms from real sentence endings
    private int position; // index in fileContents of the next character to be read
    private boolean haveMoreSentences; // flag - cleared once the end of the text has been reached
    private boolean inNoSpace;  // flag - should we return NoSpace during the next invocation?
    private boolean inEndOfSentence = false; // flag - a sentence end was detected and is reported (as null) by the next loadToken call
    private boolean atBeginningOfFile = true; // flag for XML printing - the header is printed while this is true

    // Counters updated by ogonkify(): both are incremented for every candidate
    // analysis that was accepted and decremented once per call when more than
    // one candidate was generated. Nothing in this class resets them.
    public int ogonkified = 0;
    public int ogonkified_total = 0;
        
    // FIXME - this method should probably be moved to Util

    /**
     * Tells whether a character belongs to one of the Unicode punctuation
     * categories: dash, start, end, connector, other, initial quote or
     * final quote punctuation.
     * <p>
     * Initial quote punctuation (e.g. U+00AB or U+201C) is included so that
     * opening quotation marks are treated the same way as their closing
     * counterparts instead of being skipped as unknown characters.
     *
     * @param ch the character to classify
     * @return {@code true} if {@code ch} is a punctuation character
     */
    boolean isPunctuation(char ch) {
       switch (Character.getType(ch)) {
           case Character.DASH_PUNCTUATION:
           case Character.START_PUNCTUATION:
           case Character.END_PUNCTUATION:
           case Character.CONNECTOR_PUNCTUATION:
           case Character.OTHER_PUNCTUATION:
           case Character.INITIAL_QUOTE_PUNCTUATION:
           case Character.FINAL_QUOTE_PUNCTUATION:
               return true;
           default:
               return false;
       }
    }
    
    /**
     * Tells whether a character may terminate a sentence.
     *
     * @param ch the character to check
     * @return {@code true} for a full stop, a question mark or an exclamation mark
     */
    boolean isSentenceBreak(char ch) {
       switch (ch) {
           case '.':
           case '?':
           case '!':
               return true;
           default:
               return false;
       }
    }
    
    /**
     * Creates a reader positioned at the start of a (not yet loaded) text.
     *
     * @param conf    configuration stored in the inherited {@code conf} field
     * @param stemmer analyser used to obtain interpretations of words
     */
    public PlainTextReader(Config conf, Stempelator stemmer) {
        this.stemmer = stemmer;
        this.conf = conf;
        this.haveMoreSentences = true;
        this.position = 0;
    }
    
    
    /**
     * Reads the whole text file, decodes it with the given charset, stores the
     * result as the text to be tokenized and rewinds the reader to its start.
     * <p>
     * If reading fails, the stored text is left empty and the file is closed
     * before the error is thrown.
     *
     * @param fileName     The name of the file to read.
     * @param inputCharset The charset used to decode the bytes of the file.
     * @return The content of the file.
     * @throws Error When something goes wrong with the reading.
     */
    public String readTextFile(String fileName, Charset inputCharset)
        throws Error
    {

        haveMoreSentences = true;
        position = 0;
        atBeginningOfFile = true; // to print XML at the next invocation of nextSentence

        fileContents = "";

        final int BUFFER_SIZE = 16384;
        char[] buffer = new char[BUFFER_SIZE];
        // accumulate in a builder: repeated String concatenation copies the
        // whole text read so far on every chunk
        StringBuilder contents = new StringBuilder();

        FileInputStream fis = null;
        try
        {
            // NOTE(review): kept for backward compatibility - other code may
            // read this property; decoding below uses inputCharset explicitly.
            System.setProperty("file.encoding", inputCharset.name());
            fis = new FileInputStream(fileName);
            Reader r = new InputStreamReader(fis, inputCharset);
            int count;
            while ((count = r.read(buffer, 0, BUFFER_SIZE)) >= 0)
            {
                contents.append(buffer, 0, count);
            }
        }
        catch (IOException ioe)
        {
            throw new Error("Error reading text file: "+ioe.getMessage());
        }
        finally
        {
            // always release the file handle, also when reading failed
            if (fis != null)
            {
                try
                {
                    fis.close();
                }
                catch (IOException ignored)
                {
                    // the stream was only read from; a failure to close it
                    // cannot affect the data already obtained
                }
            }
        }

        fileContents = contents.toString();
        return fileContents;
    }
    
    
    /**
     * Expands a compact interpretation string into the set of separate
     * interpretations it stands for. Alternatives are separated by '|';
     * within an alternative, ':'-separated positions may each list several
     * '.'-separated values, and every combination of values is produced.
     *
     * @param interpretation The interpretation to be split.
     * @return A collection (without duplicates) of the separate interpretations.
     */
    private Collection<String> splitInterpretation(String interpretation) 
    {
        Collection<String> expanded = new HashSet<String>();

        for (String alternative : interpretation.split("[|]"))
        {
            iterateAndAdd(alternative.split("[:]"), 0, expanded, "");
        }

        return expanded;
    }
    
    /**
     * Recursively collects the cartesian product of the '.'-separated values
     * found at each position of {@code table}.
     *
     * @param table  the positions of one interpretation, each possibly holding
     *               several '.'-separated values
     * @param index  the position currently being expanded
     * @param retVal receives every complete combination, joined with ':'
     * @param soFar  the combination built so far, with a leading ':'
     */
    private void iterateAndAdd(String[] table, int index, Collection<String>retVal, String soFar)
    {
        if (index >= table.length)
        {
            // String.split drops trailing empty strings, so an input such as
            // ":" produces an empty table and soFar is still empty here;
            // there is nothing to add and substring(1) would throw.
            if (soFar.length() > 0)
                retVal.add(soFar.substring(1));      /* adds without the leading ":" */
            return;
        }

        for (String value : table[index].split("[.]"))
            iterateAndAdd(table, index+1, retVal, soFar + ":" + value);
    }
    
    
    /**
     * Generates spelling variants of a word (see {@link #generateCandidates})
     * and collects the interpretations of those variants that the stemmer
     * recognizes. The public counters {@code ogonkified} and
     * {@code ogonkified_total} are updated along the way.
     *
     * @param orth the word form to generate variants from
     * @return the interpretations found for all variants; empty if none
     */
    private LinkedList<Interpretation> ogonkify(String orth) {
        final List<String> variants = generateCandidates(orth);
        final LinkedList<Interpretation> found = new LinkedList<Interpretation>();
        final Locale polish = new Locale("pl", "PL");

        for (String variant : variants) {
            String[] analyses = stemmer.stemAndForm(variant.toLowerCase(polish));
            if (analyses == null)
                continue;   // the stemmer does not know this variant

            // the result is consumed as consecutive pairs of elements
            for (int k = 0; k + 1 < analyses.length; k += 2) {
                String form = analyses[k];
                String tagSpec = analyses[k + 1];
                if (form == null || tagSpec == null)
                    continue;

                ogonkified++;
                ogonkified_total++;
                for (String tags : tagSpec.split("\\+"))
                    for (String ftags : conf.tagset.cToFtagArray(tags))
                        found.add(new Interpretation(ftags + form, false, conf));
            }
        }

        // subtract the surface form
        if (variants.size() > 1) {
            ogonkified--;
            ogonkified_total--;
        }
        return found;
    }
    
    /**
     * Peeks at the letters that start at the given position of the text,
     * without moving the reader.
     *
     * @param pos index in the text at which to start collecting letters
     * @return the run of letters starting at {@code pos}, cut off after at
     *         most five characters; empty if the character at {@code pos} is
     *         not a letter or {@code pos} is at or past the end of the text
     */
    public String getNextToken(int pos) {
        // look at no more than five characters and never past the end of the text
        final int limit = Math.min(fileContents.length(), pos + 5);
        StringBuilder letters = new StringBuilder();

        for (int i = pos; i < limit; i++) {
            char c = fileContents.charAt(i);
            if (!Character.isLetter(c))
                break;
            letters.append(c);
        }
        return letters.toString();
    }
    
    
    /**
     * Reads the next entity from the text, starting at the current position.
     * <p>
     * Depending on what is found, the result is one of:
     * <ul>
     * <li>a word {@link Segment} whose interpretations come from the stemmer
     *     (optionally extended by ogonkification, or a single "ign"
     *     interpretation when nothing else was found),</li>
     * <li>a one-character punctuation {@link Segment} tagged "interp",</li>
     * <li>a {@link NoSpace} marker, when the next token directly follows the
     *     previous one without whitespace,</li>
     * <li>{@code null}, either to report the sentence end detected by the
     *     previous call, or because the text is exhausted and no word is
     *     pending (in which case {@code haveMoreSentences} is cleared).</li>
     * </ul>
     * Characters that are neither letters/digits nor punctuation only
     * separate tokens and are not returned.
     *
     * @return the next entity, or {@code null} at a sentence end or at the end of the text
     */
    public Entity loadToken() {

        // word form as it appeared in text
        String orth = "";

        Character ch;
        boolean inWord = false;


        tokenReading:
        while( position < fileContents.length() ) {

            // the previous call returned sentence-ending punctuation:
            // report the end of the sentence before reading anything else
            if (inEndOfSentence) {
                inEndOfSentence = false;
                return null; // end of sentence
            }

            ch=fileContents.charAt(position);
            
            // whitespace after the previous token cancels a pending NoSpace;
            // any other character makes us emit it, without consuming the character
            if (Character.isWhitespace(ch)) {
                inNoSpace = false;
            } else {
                if (inNoSpace) {
                    inNoSpace = false;
                    return new NoSpace(conf);
                }
            }
            // from here on, position points just past ch
            position++;
            
            // letter or digit - extend the current word
            if (Character.isLetterOrDigit(ch)) {
                if (!inWord) inWord= true;
                orth = orth+ch;
            } else
            // punctuation - return as a segment
            if(isPunctuation(ch) && !isSentenceBreak(ch)) {
                if (inWord) {
                    // finish the word first; un-read the punctuation so that
                    // the following calls return NoSpace and then the punctuation
                    inNoSpace = true;
                    position--;
                    break tokenReading;
                } else {
                    Interpretation[] punct_interp = new Interpretation[1];
                    punct_interp[0] = new Interpretation(conf.tagset.cToFtag("interp") +  ch.toString(), true, conf);
                    inNoSpace = true;
                    return new Segment(null, ch.toString(), punct_interp, conf);
                }

            } else
           // could be a sentence break?
            if (isSentenceBreak(ch)) {
                inNoSpace = true;
                // we're in a word, process this word and later come back for the "."
                if (inWord && orth.length()>0) {
                    position--;
                    break tokenReading;


                } else {
                    boolean nextSentence = false;
                    // compare known abbreviations against the previous word;
                    // a sentence end is only considered at all if some word has
                    // been returned since the last sentence end (lastWord != null)
                    if (lastWord!=null) {
                        nextSentence = true;
                        if(ch=='.')
                        if(conf.acronymsAfter.contains(lastWord.toLowerCase()))
                            nextSentence = false;
                    }
                    // checking for "...": a sentence break directly followed by
                    // another one does not end the sentence. The look-ahead is
                    // made only when at least two more characters remain;
                    // otherwise the break is not treated as a sentence end either.
                    if (nextSentence) {
                        if (fileContents.length()>position+1) {
                            char cr = fileContents.charAt(position);
                            if (isSentenceBreak(cr)) {
                                nextSentence = false;
                            }
                        } else
                            nextSentence = false;
                    }
                    // check next word: a '.' directly followed by letters listed
                    // in conf.acronymsBefore does not end the sentence
                    if (nextSentence) {
                        String nextTok = getNextToken(position);
                        if (nextTok!=null) {
                            if(ch=='.')
				if(conf.acronymsBefore.contains(nextTok.toLowerCase()))
				    nextSentence = false;
                        }
                    }
                    // remember the verdict; the sentence end itself is reported
                    // (as null) by the next call, after this punctuation segment
                    if (nextSentence)
                    {
                        lastWord = null;
                        inEndOfSentence = true;
                    } else
                        inEndOfSentence = false;
                    Interpretation[] punct_interp = new Interpretation[1];
                    punct_interp[0] = new Interpretation(conf.tagset.cToFtag("interp") +  ch.toString(), true, conf);
                    return new Segment(null, ch.toString(), punct_interp, conf);
                }

            // anything else
            } else {
                // hand over the word for processing
                if (inWord) {
                    break tokenReading;
                }
            }
        }
        // end of text reached with no pending word: reset the scanner state
        // and signal that there are no more sentences
        if (position > fileContents.length()-1 && orth.length()==0) {
            haveMoreSentences = false;
            inEndOfSentence = false;
            lastWord = null;
            inNoSpace = false;
            inWord = false;
            return null;
        }
        // orth is non-empty from here on; remember it for the acronym check
        lastWord = orth;

        //String[] interps = stemmer.stemAndForm(orth.toLowerCase(new Locale("pl", "PL")));
        // analyse the word as written; if that yields nothing, retry with the
        // first character upper-cased and the rest lower-cased (Polish locale)
        String[] interps = stemmer.stemAndForm(orth);
        if (interps == null) {
            String upCased = orth.substring(1);
            upCased = Character.toUpperCase(orth.charAt(0)) + upCased.toLowerCase(new Locale("pl", "PL"));            
            interps = stemmer.stemAndForm(upCased);
        }

        LinkedList<Interpretation> interpretations = new LinkedList<Interpretation>();
        if (interps!=null)
        {  /* there are some interpretations */
            // the array is consumed as consecutive pairs: the second element is
            // split on '+', each part is expanded by cToFtagArray, and the first
            // element is appended to every resulting tag string
            for (int j=0; j<interps.length-1; j+=2)
            {  /* for each returned interpretation */
                if (interps[j]!=null)
                if (interps[j+1]!=null) {
                   for (String interp_tags : interps[j+1].split("\\+")) {
                      for (String ftags : conf.tagset.cToFtagArray(interp_tags)) {
                          interpretations.add( new Interpretation(ftags + interps[j], false, conf) );
                      }   
                   }
                }
            }

        }

	/* if the analyzer doesn't know what it is (strategy 'M') or
	   ogonkification should always be applied (strategy 'A'), then try
	   ogonkification and append all obtained Interpretations */
        if(conf.ogonkifyStrategy == 'M' && interpretations.isEmpty() ||
	   conf.ogonkifyStrategy == 'A') 
	    interpretations.addAll(ogonkify(orth));

	/* if the analyzer still has no idea what it is,
	   append an 'ign' interpretation */
        if(interpretations.isEmpty())    
	    interpretations.add(new Interpretation(conf.tagset.cToFtag("ign") + orth, false, conf));
	
        return new Segment(null, orth, interpretations.toArray(new Interpretation[0]), conf);
    }

    /**
     * Generates the list of spelling variants of a word that are obtained by
     * replacing any subset of its characters with the substitutions configured
     * in {@code conf.ogonkifySubstitutions}. The unchanged word is the first
     * element of the result.
     * <p>
     * If the length of the word is outside the configured range
     * ({@code conf.ogonkifyMinLength} to {@code conf.ogonkifyMaxLength}), no
     * variants are generated and the result holds a single empty string.
     *
     * @param orth form to generate candidates from
     * @return the generated variants, never empty
     */
    public List<String> generateCandidates(String orth) {
        LinkedList<String> variants = new LinkedList<String>();
        variants.add("");

        final int length = orth.length();
        if (length < conf.ogonkifyMinLength || length > conf.ogonkifyMaxLength)
            return variants;

        // grow every variant by one character of the word at a time
        for (int pos = 0; pos < length; pos++) {
            String letter = orth.substring(pos, pos + 1);
            String[] replacements = conf.ogonkifySubstitutions.get(letter);
            LinkedList<String> extended = new LinkedList<String>();

            for (String prefix : variants) {
                extended.add(prefix + letter);
                if (replacements != null) {
                    for (String replacement : replacements)
                        extended.add(prefix + replacement);
                }
            }
            variants = extended;
        }

        return variants;
    }

    
    /**
     * Replaces the text to be tokenized and rewinds the reader to its start.
     *
     * @param fileContents the new text
     */
    public void setFileContents(String fileContents) {
        position = 0;
        haveMoreSentences = true;
        this.fileContents = fileContents;
    }
   
    /**
     * Prints the XML that frames the output and tells whether more sentences
     * are available. The header is printed on the first call after a file has
     * been loaded; the closing tags are printed on every call made once the
     * text is exhausted.
     *
     * @param out the stream the XML header and footer are written to
     * @return {@code true} while there are more sentences to read
     */
    public boolean nextSentence(PrintStream out) {

        if (atBeginningOfFile) {
            out.print("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"+
                      //"<!DOCTYPE cesAna SYSTEM \"xcesAnaIPI.dtd\">\n"+
                      "<cesAna xmlns:xlink=\"http://www.w3.org/1999/xlink\" type=\"pre_morph\" version=\"IPI-1.2\">\n"+
                      "<chunkList>\n");
            atBeginningOfFile = false;
        }

        final boolean more = haveMoreSentences;
        if (!more) {
            out.println("</chunkList>\n"+
                        "</cesAna>\n");
        }
        return more;
    }

    /**
     * Does nothing at present.
     */
    @Override
    public void close() {
        // TODO: no-op for now
    }

    
}
