/*
 * Decompiled with CFR 0.152.
 */
package org.cleartk.corpus.ace2005;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.UimaContext;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.component.JCasCollectionReader_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.SofaCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.FileUtils;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;
import org.cleartk.ne.type.Ace2005Document;
import org.cleartk.util.ViewUriUtil;
import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.JDOMException;
import org.jdom2.input.SAXBuilder;

@SofaCapability(outputSofas={"ACE_2005_APF_URI_VIEW", "UriView"})
public class Ace2005GoldReader
extends JCasCollectionReader_ImplBase {
    public static final String PARAM_ACE_DIRECTORY_NAME = "aceDirectoryName";
    @ConfigurationParameter(name="aceDirectoryName", mandatory=true, description="Takes the name of directory that contains ACE data.  Typically, a folder such as \".../ACE_2005/optimization/English/all\".  The folder should contain files that come in pairs - i.e. for each .sgm file there should be a corresponding .apf.xml file.")
    private String aceDirectoryName;
    private static final String PARAM_ACE_FILE_NAMES_DESCRIPTION = "takes a file that contains the names of the files to read.   \nThe file should contain a list of the files in AceCorpusDir (one file name per line) \nthat you want read in. File names should not include the last suffix(es) (e.g. \".sgm\" or \"apf.xml\") \nIf parameter value is not given, then all files will be read in. An example file might look like this: \n\nAFP_ENG_20030304.0250\nAFP_ENG_20030305.0918\n...\n";
    public static final String PARAM_ACE_FILE_NAMES_FILE = "aceFileNamesFile";
    @ConfigurationParameter(name="aceFileNamesFile", description="takes a file that contains the names of the files to read.   \nThe file should contain a list of the files in AceCorpusDir (one file name per line) \nthat you want read in. File names should not include the last suffix(es) (e.g. \".sgm\" or \"apf.xml\") \nIf parameter value is not given, then all files will be read in. An example file might look like this: \n\nAFP_ENG_20030304.0250\nAFP_ENG_20030305.0918\n...\n")
    private String aceFileNamesFile;
    File[] aceFiles;
    int aceFileIndex;
    int aceFileCount;
    File currentSGMFile = null;
    public static final String TAG_REGEX = "<.*?>";
    Pattern tagPattern;

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     */
    public void initialize(UimaContext context) throws ResourceInitializationException {
        if (!new File(this.aceDirectoryName).exists()) {
            throw new ResourceInitializationException((Throwable)new IOException(String.format("directory %s does not exist", this.aceDirectoryName)));
        }
        File aceDirectory = new File(this.aceDirectoryName);
        if (this.aceFileNamesFile != null && !this.aceFileNamesFile.trim().equals("")) {
            try {
                ArrayList<File> files = new ArrayList<File>();
                try (BufferedReader reader = new BufferedReader(new FileReader(this.aceFileNamesFile));){
                    String line;
                    while ((line = reader.readLine()) != null) {
                        if ((line = line.trim()).endsWith(".sgm")) {
                            files.add(new File(aceDirectory, line));
                            continue;
                        }
                        files.add(new File(aceDirectory, line + ".sgm"));
                    }
                }
                this.aceFiles = files.toArray(new File[files.size()]);
            }
            catch (IOException ioe) {
                throw new ResourceInitializationException((Throwable)ioe);
            }
            for (File file : this.aceFiles) {
                if (file.exists()) continue;
                throw new ResourceInitializationException("could_not_access_data", new Object[]{file});
            }
        } else {
            this.aceFiles = aceDirectory.listFiles();
        }
        this.aceFileIndex = 0;
        this.aceFileCount = 0;
        this.tagPattern = Pattern.compile(TAG_REGEX, 40);
    }

    private File getNextSGMFile() {
        if (this.currentSGMFile != null) {
            return this.currentSGMFile;
        }
        while (this.aceFileIndex < this.aceFiles.length) {
            File sgmFile;
            if (!(sgmFile = this.aceFiles[this.aceFileIndex++]).getName().endsWith(".sgm")) continue;
            this.currentSGMFile = sgmFile;
            return sgmFile;
        }
        return null;
    }

    private File getAPFFile(File sgmFile) {
        String apfFileName = sgmFile.getPath();
        apfFileName = sgmFile.getPath().substring(0, apfFileName.length() - 3) + "apf.xml";
        if (new File(apfFileName).exists()) {
            return new File(apfFileName);
        }
        apfFileName = sgmFile.getPath();
        apfFileName = sgmFile.getPath().substring(0, apfFileName.length() - 3) + "entities.apf.xml";
        if (new File(apfFileName).exists()) {
            return new File(apfFileName);
        }
        apfFileName = sgmFile.getPath();
        apfFileName = sgmFile.getPath().substring(0, apfFileName.length() - 3) + "mentions.apf.xml";
        if (new File(apfFileName).exists()) {
            return new File(apfFileName);
        }
        return null;
    }

    private String getDocumentText(String sgmText) {
        StringBuffer rawDocumentText = new StringBuffer(sgmText);
        Matcher tagMatcher = this.tagPattern.matcher(rawDocumentText);
        String documentText = tagMatcher.replaceAll("");
        return documentText;
    }

    public void getNext(JCas jCas) throws IOException, CollectionException {
        try {
            File sgmFile = this.getNextSGMFile();
            this.currentSGMFile = null;
            String sgmText = FileUtils.file2String((File)sgmFile);
            JCas initialView = jCas.getView("_InitialView");
            initialView.setDocumentText(this.getDocumentText(sgmText));
            File apfFile = this.getAPFFile(sgmFile);
            SAXBuilder builder = new SAXBuilder();
            builder.setDTDHandler(null);
            Document doc = builder.build(apfFile);
            Element apfSource = doc.getRootElement();
            String uri = apfSource.getAttributeValue("URI");
            String source = apfSource.getAttributeValue("SOURCE");
            String type = apfSource.getAttributeValue("TYPE");
            ViewUriUtil.setURI((JCas)jCas, (URI)sgmFile.toURI());
            Ace2005Document document = new Ace2005Document(initialView);
            document.setAceUri(uri);
            document.setAceSource(source);
            document.setAceType(type);
            document.addToIndexes();
            JCas apfUriView = jCas.createView("ACE_2005_APF_URI_VIEW");
            apfUriView.setSofaDataURI(apfFile.toURI().toString(), null);
        }
        catch (CASException ce) {
            throw new CollectionException((Throwable)ce);
        }
        catch (JDOMException je) {
            throw new CollectionException((Throwable)je);
        }
    }

    public void close() throws IOException {
    }

    public Progress[] getProgress() {
        return new Progress[]{new ProgressImpl(this.aceFileIndex, this.aceFiles.length, "entities")};
    }

    public boolean hasNext() throws IOException, CollectionException {
        return this.getNextSGMFile() != null;
    }

    public void setAceDirectoryName(String aceDirectoryName) {
        this.aceDirectoryName = aceDirectoryName;
    }

    public void setAceFileNamesFile(String aceFileNamesFile) {
        this.aceFileNamesFile = aceFileNamesFile;
    }
}

