/*
 * Decompiled with CFR 0.152.
 */
package org.cleartk.token.tokenizer;

import java.util.regex.Pattern;
import org.cleartk.token.tokenizer.Tokenizer_ImplBase;

/**
 * Tokenizer that splits text by running it through a fixed sequence of regular-expression
 * substitutions, each of which pads the matched text with spaces, and then splitting the
 * result on single space characters.
 *
 * <p>Every regex source string and its compiled {@link Pattern} is exposed as a public,
 * non-final static field. The class name suggests the rules follow Penn Treebank tokenization
 * conventions; that is presumed from the name and not verified here.
 */
public class PennTreebankTokenizer
extends Tokenizer_ImplBase {
    /** Character-class body (escaped) listing the opening brackets: {@code [ ( { <}. */
    public static String openBracesRegex = "\\[\\(\\{\\<";
    /** Character-class body (escaped) listing the closing brackets: {@code ] ) } >}. */
    public static String closedBracesRegex = "\\]\\)\\}\\>";
    /** Captures any single opening or closing bracket character. */
    public static String bracesRegex = "([" + openBracesRegex + closedBracesRegex + "])";
    public static Pattern bracesPattern = Pattern.compile(bracesRegex);

    /** Captures a literal three-dot ellipsis. */
    public static String ellipsisRegex = "(" + Pattern.quote("...") + ")";
    public static Pattern ellipsisPattern = Pattern.compile(ellipsisRegex);

    /** Captures a comma unless it has a digit on both sides. */
    public static String commaRegex = "((?<!\\d),|,(?!\\d))";
    public static Pattern commaPattern = Pattern.compile(commaRegex);

    /** Captures a dollar sign together with any upper-case letters directly before it. */
    public static String dollarSignRegex = "([A-Z]*\\$)";
    public static Pattern dollarSignPattern = Pattern.compile(dollarSignRegex);

    /** Captures an ampersand unless it has an upper-case letter on both sides. */
    public static String ampersandRegex = "((?<![A-Z])&|&(?![A-Z]))";
    public static Pattern ampersandPattern = Pattern.compile(ampersandRegex);

    /** Captures a run of two or more hyphens, or a single hyphen followed by whitespace. */
    public static String dashRegex = "(--+|-(?=\\s))";
    public static Pattern dashPattern = Pattern.compile(dashRegex);

    /** Captures a {@code digits:digits} group as a unit, or otherwise a lone colon. */
    public static String colonRegex = "(\\d+:\\d+|:)";
    public static Pattern colonPattern = Pattern.compile(colonRegex);

    /** Captures a double backtick or one of {@code | ; @ # ` %}, when not followed by a hyphen. */
    public static String nonFinalPunctRegex = "(``|[|;@#`%])(?!-)";
    public static Pattern nonFinalPunctPattern = Pattern.compile(nonFinalPunctRegex);

    /**
     * Captures a period in one of three situations: after a digit and before non-newline
     * whitespace; as the second of exactly two consecutive periods; or (not preceded by two
     * periods) followed only by closing brackets, quotes, and similar characters up to the end
     * of a line.
     */
    public static String periodRegex = "((?<=\\d)\\.(?=[^\\n\\S])|(?<=[^.]\\.)\\.(?![.])|(?<!\\.\\.)\\.[" + closedBracesRegex + "\"'`/_#*\\s]*$)";
    public static Pattern periodPattern = Pattern.compile(periodRegex, Pattern.MULTILINE);

    /** Captures a question mark or exclamation mark. */
    public static String nonPeriodPunctRegex = "([?!])";
    public static Pattern nonPeriodPunctPattern = Pattern.compile(nonPeriodPunctRegex);

    /**
     * Captures an apostrophe followed by digits and an optional {@code s}, or a lone apostrophe
     * that is adjacent to whitespace on one side and not adjacent to another apostrophe on the
     * relevant side.
     */
    public static String singleQuoteRegex = "('\\d+s?|(?<=\\s)'(?!')|(?<!')'(?=\\s))";
    public static Pattern singleQuotePattern = Pattern.compile(singleQuoteRegex);

    /** Matches three consecutive apostrophes. */
    public static String tripleQuoteRegex = "'''";
    public static Pattern tripleQuotePattern = Pattern.compile(tripleQuoteRegex);

    /** Matches two consecutive apostrophes. */
    public static String doubleQuoteRegex = "''";
    public static Pattern doubleQuotePattern = Pattern.compile(doubleQuoteRegex);

    /** Matches a literal double-quote character. */
    public static String quoteRegex = Pattern.quote("\"");
    public static Pattern quotePattern = Pattern.compile(quoteRegex);

    /** Captures a clitic suffix ({@code 'll 're 've n't 's 'm 'd}) at a word boundary; case-insensitive. */
    public static String oneWordAbbreviationRegex = "('ll|'re|'ve|n't|'[smd])\\b";
    public static Pattern oneWordAbbreviationPattern = Pattern.compile(oneWordAbbreviationRegex, Pattern.CASE_INSENSITIVE);

    /** Words that are split into two pieces (one capture group per piece); compiled case-insensitively. */
    public static String[] twoWordAbbreviationRegexes = new String[]{"\\b(can)(not)\\b", "\\b(d')(ye)\\b", "\\b(gim)(me)\\b", "\\b(gon)(na)\\b", "\\b(got)(ta)\\b", "\\b(lem)(me)\\b", "\\b(more)('n)\\b", "\\b(wan)(na)\\b"};
    public static Pattern[] twoWordAbbreviationPatterns = compileAllIgnoringCase(twoWordAbbreviationRegexes);

    /** Words that are split into three pieces (one capture group per piece); compiled case-insensitively. */
    public static String[] threeWordAbbreviationRegexes = new String[]{"\\b(wha)(dd)(ya)\\b", "\\b(wha)(t)(cha)\\b"};
    public static Pattern[] threeWordAbbreviationPatterns = compileAllIgnoringCase(threeWordAbbreviationRegexes);

    /**
     * Captures {@code 't} followed by {@code is} or {@code was}, as two groups.
     * NOTE(review): unlike the other abbreviation patterns this one is compiled without the
     * case-insensitive flag; confirm whether that is intentional.
     */
    public static String tAbbreviationRegex = "('t)(is|was)\\b";
    public static Pattern tAbbreviationPattern = Pattern.compile(tAbbreviationRegex);

    /** Matches the zero-width start or end of every line. */
    public static String beginOrEndRegex = "^|$";
    public static Pattern beginOrEndPattern = Pattern.compile(beginOrEndRegex, Pattern.MULTILINE);

    /** Matches whitespace at the start or end of a line, and any space/tab run that follows a space or tab. */
    public static String extraSpaceRegex = "^(\\s+)|(\\s+)$|(?<=[ \\t])[ \\t]+";
    public static Pattern extraSpacePattern = Pattern.compile(extraSpaceRegex, Pattern.MULTILINE);

    /** Captures any run of one or more whitespace characters. */
    public static String multipleWhitespaceRegex = "(\\s+)";
    public static Pattern multipleWhitespacePattern = Pattern.compile(multipleWhitespaceRegex, Pattern.MULTILINE);

    /**
     * Patterns whose first capture group is surrounded with spaces, applied in array order at
     * the start of {@link #getTokenTexts(String)}. The array is populated from the static
     * pattern fields as they are when the instance is created.
     */
    protected Pattern[] patterns = new Pattern[]{
        ellipsisPattern,
        commaPattern,
        dollarSignPattern,
        ampersandPattern,
        dashPattern,
        colonPattern,
        nonFinalPunctPattern,
        periodPattern,
        nonPeriodPunctPattern,
        bracesPattern
    };

    /**
     * Splits {@code text} into token strings.
     *
     * <p>The substitutions run in a fixed order: punctuation padding, line-boundary padding,
     * quote handling (longest quote sequence first), abbreviation splitting, and finally
     * whitespace normalisation. The normalised string is then split on single spaces.
     *
     * @param text the text to tokenize
     * @return the token strings in order of appearance; an empty array when nothing but
     *         whitespace remains after the substitutions
     */
    @Override
    public String[] getTokenTexts(String text) {
        String working = text;

        // Pad each piece of captured punctuation with a space on both sides.
        for (Pattern punctuation : this.patterns) {
            working = punctuation.matcher(working).replaceAll(" $1 ");
        }

        // Insert a space at both ends of every line so the whitespace lookarounds in the
        // quote patterns can match at line boundaries.
        working = beginOrEndPattern.matcher(working).replaceAll(" ");

        // Quotes: three apostrophes before two, then lone apostrophes, then double quotes.
        working = tripleQuotePattern.matcher(working).replaceAll(" ' '' ");
        working = doubleQuotePattern.matcher(working).replaceAll(" '' ");
        working = singleQuotePattern.matcher(working).replaceAll(" $1 ");
        working = quotePattern.matcher(working).replaceAll(" \" ");

        // Abbreviations: put a space in front of each captured piece.
        working = oneWordAbbreviationPattern.matcher(working).replaceAll(" $1");
        for (Pattern twoPart : twoWordAbbreviationPatterns) {
            working = twoPart.matcher(working).replaceAll(" $1 $2");
        }
        working = tAbbreviationPattern.matcher(working).replaceAll(" $1 $2");
        for (Pattern threePart : threeWordAbbreviationPatterns) {
            working = threePart.matcher(working).replaceAll(" $1 $2 $3");
        }

        // Strip the padding that is no longer needed, then collapse every remaining
        // whitespace run to a single space so that split(" ") yields no interior empties.
        working = extraSpacePattern.matcher(working).replaceAll("");
        working = multipleWhitespacePattern.matcher(working).replaceAll(" ");

        String[] pieces = working.split(" ");
        // Splitting an empty string produces one empty element; report that as no tokens.
        if (pieces.length == 1 && pieces[0].isEmpty()) {
            return new String[0];
        }
        return pieces;
    }

    /** Compiles every regex in {@code regexes} with the case-insensitive flag, preserving order. */
    private static Pattern[] compileAllIgnoringCase(String[] regexes) {
        Pattern[] compiled = new Pattern[regexes.length];
        int index = 0;
        for (String regex : regexes) {
            compiled[index++] = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
        }
        return compiled;
    }
}

