/*
 * Decompiled with CFR 0.152.
 */
package org.apache.ctakes.smokingstatus.MLutil;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.regex.Pattern;

/**
 * Converts a file of {@code sentence|label} records into binary feature vectors and
 * writes them out in LIBSVM text format.
 *
 * <p>Each feature vector holds one 0/1 entry per keyword (in the iteration order of
 * {@link #keywords}), followed by a 0/1 "date mention" flag, followed by the class label
 * string ({@code PAST_SMOKER}, {@code CURRENT_SMOKER} or {@code SMOKER}).
 */
public class GenerateTrainingData {
    /** Characters that are replaced by a space before a sentence is tokenized. */
    private static final String PUNCTUATION_REGEX = "[.?!:;()',\"{}<>#+]";

    /**
     * Full-token patterns that count as a year/date mention: four-digit years in the
     * 1900s/2000s (optionally followed by "s"), decades such as "90s", and numeric dates
     * separated by "/" or "-". Compiled once instead of on every token.
     */
    private static final Pattern DATE_MENTION = Pattern.compile(
            "19\\d\\d|19\\d\\ds|20\\d\\d|20\\d\\ds|[1-9]0s"
            + "|\\d{1,2}[/-]\\d{1,2}|\\d{1,2}[/-]\\d{4}"
            + "|\\d{1,2}[/-]\\d{1,2}[/-]\\d{2}|\\d{1,2}[/-]\\d{1,2}[/-]\\d{4}");

    /** Lower-cased keyword lines; a keyword containing "_" is matched against bigrams. */
    Set<String> keywords;
    /** Lower-cased stopword lines; matching tokens are dropped before feature extraction. */
    Set<String> stopwords;
    /** One row per accepted input line: Integer flags followed by the String class label. */
    @SuppressWarnings("rawtypes")
    List<List<Comparable>> features = new ArrayList<List<Comparable>>();

    /**
     * Loads the keyword and stopword lists. Loading is best-effort: if a file cannot be
     * read the stack trace is printed and the corresponding set is left empty.
     *
     * @param keywordsFileName path of the keyword list
     * @param stopwordsFileName path of the stopword list
     */
    GenerateTrainingData(String keywordsFileName, String stopwordsFileName) {
        this.keywords = GenerateTrainingData.loadOrEmpty(keywordsFileName);
        this.stopwords = GenerateTrainingData.loadOrEmpty(stopwordsFileName);
    }

    /** Reads a word list, falling back to an empty set (after reporting) on any failure. */
    private static Set<String> loadOrEmpty(String fileName) {
        try {
            return GenerateTrainingData.readLinesFromFile(fileName);
        }
        catch (Exception e) {
            // Deliberately broad: construction has always tolerated unreadable list files.
            e.printStackTrace();
            return new HashSet<String>();
        }
    }

    /**
     * Reads every line that is neither blank nor starts with "//" and returns the
     * lower-cased lines as a set. Lines are not trimmed before being stored.
     *
     * @param fileName path of the file to read
     * @return the lower-cased, non-comment, non-blank lines
     * @throws IOException if the file cannot be opened or read
     */
    private static Set<String> readLinesFromFile(String fileName) throws IOException {
        Set<String> returnValues = new HashSet<String>();
        BufferedReader fileReader = new BufferedReader(new FileReader(new File(fileName)));
        try {
            String line;
            while ((line = fileReader.readLine()) != null) {
                if (line.startsWith("//") || line.trim().length() <= 0) {
                    continue;
                }
                returnValues.add(line.toLowerCase());
            }
        }
        finally {
            // Close on every path so the file handle is not leaked.
            fileReader.close();
        }
        return returnValues;
    }

    /**
     * Reads {@code sentence|label} records from {@code fname} and appends one feature row
     * per record to {@link #features}.
     *
     * <p>Empty lines are ignored. Lines that do not split into an even, non-zero number of
     * "|"-separated fields are reported on standard error and skipped; previously such a
     * line either aborted the remainder of the file or silently reused the previous
     * record. When a line holds several sentence/label pairs, the last pair is used.
     * A label that does not start with p, c or s (case-insensitive) terminates the JVM
     * with exit status 1. I/O failures are reported via a stack trace and end processing.
     *
     * @param fname path of the labelled sentence file
     */
    public void makeFeatures(String fname) {
        try {
            BufferedReader fin = new BufferedReader(new FileReader(fname));
            try {
                String str;
                int lineNumber = 0;
                while ((str = fin.readLine()) != null) {
                    ++lineNumber;
                    if (str.length() == 0) {
                        continue;
                    }
                    String[] record = GenerateTrainingData.splitRecord(str);
                    if (record == null) {
                        System.err.println("Skipping malformed line " + lineNumber + " of " + fname + ": " + str);
                        continue;
                    }
                    String cls = GenerateTrainingData.normalizeClassLabel(record[1]);
                    if (cls == null) {
                        System.out.println("Undefined class label:" + record[1]);
                        System.exit(1);
                    }
                    this.features.add(this.buildFeatureVector(record[0], cls));
                }
            }
            finally {
                fin.close();
            }
        }
        catch (Exception e) {
            // Kept broad so callers continue to see a non-throwing, best-effort method.
            e.printStackTrace();
        }
    }

    /**
     * Splits a record on "|" into a trimmed {sentence, label} pair.
     *
     * @return the last sentence/label pair on the line, or {@code null} when the line does
     *         not contain an even, non-zero number of fields
     */
    private static String[] splitRecord(String line) {
        StringTokenizer tokens = new StringTokenizer(line, "|");
        int count = tokens.countTokens();
        if (count == 0 || count % 2 != 0) {
            return null;
        }
        String sentence = null;
        String label = null;
        while (tokens.hasMoreTokens()) {
            sentence = tokens.nextToken().trim();
            label = tokens.nextToken().trim();
        }
        return new String[] {sentence, label};
    }

    /**
     * Maps a raw label to its canonical class name by its first letter (case-insensitive).
     *
     * @return the canonical label, or {@code null} if the first letter is not p, c or s
     */
    private static String normalizeClassLabel(String rawLabel) {
        String lowered = rawLabel.toLowerCase();
        if (lowered.startsWith("p")) {
            return "PAST_SMOKER";
        }
        if (lowered.startsWith("c")) {
            return "CURRENT_SMOKER";
        }
        if (lowered.startsWith("s")) {
            return "SMOKER";
        }
        return null;
    }

    /**
     * Builds the feature row for one sentence: a 0/1 flag per keyword, a 0/1 date-mention
     * flag, and finally the class label.
     */
    @SuppressWarnings("rawtypes")
    private List<Comparable> buildFeatureVector(String sentence, String cls) {
        String cleaned = sentence.toLowerCase().replaceAll(PUNCTUATION_REGEX, " ").trim();
        // Runs of two or more hyphens act as separators; single hyphens stay inside tokens.
        cleaned = cleaned.replaceAll("-{2,}", " ").trim();

        List<String> unigrams = new ArrayList<String>();
        for (String token : cleaned.split("\\s")) {
            if (this.stopwords.contains(token) || token.trim().length() <= 0) {
                continue;
            }
            unigrams.add(token);
        }
        // Bigrams are formed after stopword removal, joined with "_".
        List<String> bigrams = new ArrayList<String>();
        for (int i = 0; i < unigrams.size() - 1; ++i) {
            bigrams.add(unigrams.get(i) + "_" + unigrams.get(i + 1));
        }

        List<Comparable> feature = new ArrayList<Comparable>();
        for (String k : this.keywords) {
            List<String> candidates = k.indexOf("_") != -1 ? bigrams : unigrams;
            feature.add(Integer.valueOf(GenerateTrainingData.containsIgnoreCase(candidates, k) ? 1 : 0));
        }

        int hasYear = 0;
        for (String token : unigrams) {
            if (DATE_MENTION.matcher(token).matches()) {
                hasYear = 1;
                break;
            }
        }
        feature.add(Integer.valueOf(hasYear));
        feature.add(cls);
        return feature;
    }

    /** Returns whether any element of {@code values} equals {@code wanted}, ignoring case. */
    private static boolean containsIgnoreCase(List<String> values, String wanted) {
        for (String value : values) {
            if (wanted.equalsIgnoreCase(value)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Writes {@link #features} to {@code fname}, one row per line, as
     * {@code <class> 1:<v1> 2:<v2> ... } where the class is 1 for CURRENT_SMOKER, 2 for
     * PAST_SMOKER and 3 for SMOKER. Any other label terminates the JVM with exit status 1.
     * If the file cannot be opened the stack trace is printed and nothing is written.
     *
     * @param fname path of the output file
     */
    @SuppressWarnings("rawtypes")
    private void printLibsvmDataToFile(String fname) {
        try {
            PrintStream fout = new PrintStream(new FileOutputStream(fname));
            try {
                for (List<Comparable> row : this.features) {
                    // The label is always the last element of a row.
                    String clsStr = (String)row.get(row.size() - 1);
                    int cls = -1;
                    if (clsStr.equals("CURRENT_SMOKER")) {
                        cls = 1;
                    } else if (clsStr.equals("PAST_SMOKER")) {
                        cls = 2;
                    } else if (clsStr.equals("SMOKER")) {
                        cls = 3;
                    } else {
                        System.out.println("Undefined class label:" + clsStr);
                        System.exit(1);
                    }
                    fout.print(cls + " ");
                    for (int i = 0; i < row.size() - 1; ++i) {
                        // Feature indices in the output are 1-based.
                        fout.print((i + 1) + ":" + row.get(i) + " ");
                    }
                    fout.print('\n');
                }
            }
            finally {
                fout.close();
            }
        }
        catch (FileNotFoundException e) {
            e.printStackTrace();
        }
    }

    /**
     * Command-line entry point.
     *
     * <p>With exactly four arguments they are used as
     * {@code <keywordsFile> <stopwordsFile> <dataFile> <libsvmOutputFile>}. With any other
     * argument count the built-in default paths are used (a notice is printed when
     * arguments were supplied but ignored).
     *
     * @param args optional file paths as described above
     */
    public static void main(String[] args) {
        String keywordsFile = "C:/cTAKES-1.0.5/smoking status/resources/ss/data/PCS/keywords_PCS_NHGRI.txt";
        String stopwordsFile = "C:/cTAKES-1.0.5/smoking status/resources/ss/data/PCS/stopwords_PCS.txt";
        String dataFile = "C:/Temp/SentenceLevelSmokingStatus_PCS.txt";
        String libsvmDataFile = "C:/Temp/libsvm_data.txt";
        if (args != null && args.length == 4) {
            keywordsFile = args[0];
            stopwordsFile = args[1];
            dataFile = args[2];
            libsvmDataFile = args[3];
        } else if (args != null && args.length != 0) {
            System.err.println("Expected 4 arguments (keywordsFile stopwordsFile dataFile libsvmOutputFile); using default paths.");
        }
        GenerateTrainingData gtd = new GenerateTrainingData(keywordsFile, stopwordsFile);
        gtd.makeFeatures(dataFile);
        gtd.printLibsvmDataToFile(libsvmDataFile);
    }
}

