package org.dkpro.tc.features.ngram.meta;

import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.StringEncoder;
import org.apache.commons.codec.language.ColognePhonetic;
import org.apache.commons.codec.language.Soundex;
import org.apache.commons.lang.StringUtils;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.dkpro.tc.api.exception.TextClassificationException;
import org.dkpro.tc.api.type.TextClassificationTarget;
import org.dkpro.tc.features.ngram.base.NGramFeatureExtractorBase;
import org.dkpro.tc.features.ngram.util.KeywordNGramUtils;

/* loaded from: input_file:org/dkpro/tc/features/ngram/meta/PhoneticNGramMC.class */
/**
 * Meta collector that counts phonetic n-grams of a document.
 *
 * <p>Each token is replaced by its phonetic code (Soundex for documents tagged {@code "en"},
 * Cologne phonetic for documents tagged {@code "de"}); n-grams are then built over the code
 * sequence of each sentence and counted in a {@link FrequencyDistribution}.
 */
public class PhoneticNGramMC extends LuceneMC {
    /** Prefix of the field name returned by {@link #getFieldName()}. */
    public static final String LUCENE_PHONETIC_NGRAM_FIELD = "phoneticngram";

    /** Smallest n-gram size to collect (default 1). */
    @ConfigurationParameter(name = NGramFeatureExtractorBase.PARAM_NGRAM_MIN_N, mandatory = true, defaultValue = {"1"})
    private int ngramMinN;

    /** Largest n-gram size to collect (default 3). */
    @ConfigurationParameter(name = NGramFeatureExtractorBase.PARAM_NGRAM_MAX_N, mandatory = true, defaultValue = {"3"})
    private int ngramMaxN;

    /**
     * Collects the phonetic n-grams of the whole document by spanning a target annotation from
     * offset 0 to the length of the document text.
     *
     * @param jCas the document to process
     * @return frequency distribution of the phonetic n-grams found in the document
     * @throws TextClassificationException if the document language is unsupported or a token
     *         cannot be encoded
     */
    @Override
    protected FrequencyDistribution<String> getNgramsFD(JCas jCas) throws TextClassificationException {
        TextClassificationTarget wholeDocument =
                new TextClassificationTarget(jCas, 0, jCas.getDocumentText().length());
        return getDocumentPhoneticNgrams(jCas, wholeDocument, this.ngramMinN, this.ngramMaxN);
    }

    /**
     * Returns the field name for this collector: {@link #LUCENE_PHONETIC_NGRAM_FIELD} followed
     * by the inherited {@code featureExtractorName}.
     */
    @Override
    protected String getFieldName() {
        return LUCENE_PHONETIC_NGRAM_FIELD + this.featureExtractorName;
    }

    /**
     * Counts the phonetic n-grams of all sentences covered by {@code annotation}.
     *
     * <p>N-grams are built per sentence, so no n-gram spans a sentence boundary. The phonetic
     * codes of one n-gram are joined with {@link KeywordNGramUtils#GLUE}.
     *
     * @param jCas the document; its language selects the phonetic encoder
     * @param annotation the span whose covered sentences are processed
     * @param minN smallest n-gram size to generate
     * @param maxN largest n-gram size to generate
     * @return frequency distribution over the joined phonetic n-grams
     * @throws TextClassificationException if the document language is neither {@code "en"} nor
     *         {@code "de"}, or if encoding a token fails
     */
    public static FrequencyDistribution<String> getDocumentPhoneticNgrams(JCas jCas, Annotation annotation, int minN, int maxN) throws TextClassificationException {
        // Resolve the encoder first so unsupported languages fail before any work is done.
        StringEncoder encoder = createEncoder(jCas.getDocumentLanguage());

        FrequencyDistribution<String> phoneticNgrams = new FrequencyDistribution<>();
        for (Sentence sentence : JCasUtil.selectCovered(jCas, Sentence.class, annotation)) {
            List<String> phoneticCodes = new ArrayList<>();
            for (Token token : JCasUtil.selectCovered(jCas, Token.class, sentence)) {
                try {
                    phoneticCodes.add(encoder.encode(token.getCoveredText()));
                } catch (EncoderException e) {
                    throw new TextClassificationException(e);
                }
            }
            String[] codeArray = phoneticCodes.toArray(new String[phoneticCodes.size()]);
            for (List<String> ngram : new NGramStringListIterable(codeArray, minN, maxN)) {
                phoneticNgrams.inc(StringUtils.join(ngram, KeywordNGramUtils.GLUE));
            }
        }
        return phoneticNgrams;
    }

    /**
     * Picks the phonetic encoder for a document language code.
     *
     * @param languageCode language code of the document; may be {@code null}
     * @return a {@link Soundex} encoder for {@code "en"}, a {@link ColognePhonetic} encoder for
     *         {@code "de"}
     * @throws TextClassificationException for any other (or missing) language code
     */
    private static StringEncoder createEncoder(String languageCode) throws TextClassificationException {
        // Constant-first comparison: a null language code reaches the descriptive exception
        // below instead of failing with a NullPointerException.
        if ("en".equals(languageCode)) {
            return new Soundex();
        }
        if ("de".equals(languageCode)) {
            return new ColognePhonetic();
        }
        throw new TextClassificationException("Language code '" + languageCode + "' not supported by phonetic ngrams FE.");
    }
}
