package org.dkpro.tc.features.ngram;

import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Level;
import org.dkpro.tc.api.exception.TextClassificationException;
import org.dkpro.tc.api.features.Feature;
import org.dkpro.tc.api.features.FeatureExtractor;
import org.dkpro.tc.api.features.FeatureType;
import org.dkpro.tc.api.features.meta.MetaCollectorConfiguration;
import org.dkpro.tc.api.type.TextClassificationTarget;
import org.dkpro.tc.features.ngram.base.LuceneFeatureExtractorBase;
import org.dkpro.tc.features.ngram.meta.CharacterNGramMC;
import org.dkpro.tc.features.ngram.meta.LuceneMC;
import org.dkpro.tc.features.ngram.util.KeywordNGramUtils;

/**
 * Feature extractor that emits one boolean feature per selected character n-gram,
 * indicating whether that n-gram occurs in the classification target.
 *
 * <p>The candidate n-grams are the keys of {@code topKSet}, which is populated outside
 * this class (it is read here but never written). N-gram length bounds and case
 * handling come from the inherited {@code ngramMinN}, {@code ngramMaxN} and
 * {@code ngramLowerCase} settings.
 */
@TypeCapability(inputs = {"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token"})
/* loaded from: input_file:org/dkpro/tc/features/ngram/CharacterNGram.class */
public class CharacterNGram extends LuceneFeatureExtractorBase implements FeatureExtractor {
    /**
     * Builds the feature set for a single classification target.
     *
     * <p>Exactly one {@link Feature} is produced for every key in {@code topKSet}: value
     * {@code 1} when the n-gram is among the target's character n-grams, value {@code 0}
     * otherwise.
     *
     * @param jCas the CAS view being processed (not read directly by this method)
     * @param textClassificationTarget the annotation whose character n-grams are computed
     * @return one boolean feature per key of {@code topKSet}
     * @throws TextClassificationException declared by the extractor contract
     */
    public Set<Feature> extract(JCas jCas, TextClassificationTarget textClassificationTarget) throws TextClassificationException {
        Set<Feature> features = new HashSet<>();
        // NOTE(review): '^' and '$' are presumably the begin/end boundary markers added
        // around the text - confirm against CharacterNGramMC.getAnnotationCharacterNgrams.
        FrequencyDistribution<String> targetNgrams = CharacterNGramMC.getAnnotationCharacterNgrams(textClassificationTarget, this.ngramLowerCase, this.ngramMinN, this.ngramMaxN, '^', '$');

        // Both values are loop-invariant, so compute them once instead of per n-gram.
        Set<String> observedNgrams = targetNgrams.getKeys();
        String namePrefix = getFeaturePrefix() + KeywordNGramUtils.GLUE;

        for (String ngram : this.topKSet.getKeys()) {
            String featureName = namePrefix + ngram;
            if (observedNgrams.contains(ngram)) {
                features.add(new Feature(featureName, 1, FeatureType.BOOLEAN));
            } else {
                // NOTE(review): the extra 'true' argument presumably marks this as the
                // default value - confirm against the Feature constructor.
                features.add(new Feature(featureName, 0, true, FeatureType.BOOLEAN));
            }
        }
        return features;
    }

    /**
     * Returns the Lucene field name used by this extractor: the character n-gram field
     * constant followed by this extractor's configured name.
     */
    @Override // org.dkpro.tc.features.ngram.base.LuceneFeatureExtractorBase, org.dkpro.tc.features.ngram.base.NGramFeatureExtractorBase
    protected String getFieldName() {
        return CharacterNGramMC.LUCENE_CHAR_NGRAM_FIELD + this.featureExtractorName;
    }

    /**
     * Returns the prefix put in front of every feature name, which is the simple name of
     * the runtime class.
     */
    @Override // org.dkpro.tc.features.ngram.base.NGramFeatureExtractorBase
    protected String getFeaturePrefix() {
        return getClass().getSimpleName();
    }

    /** Returns the configured {@code ngramUseTopK} value. */
    @Override // org.dkpro.tc.features.ngram.base.LuceneFeatureExtractorBase, org.dkpro.tc.features.ngram.base.NGramFeatureExtractorBase
    protected int getTopN() {
        return this.ngramUseTopK;
    }

    /**
     * Declares the meta collector required by this extractor.
     *
     * @param map parameter settings handed to the meta collector configuration
     * @return a single-element list configuring {@link CharacterNGramMC}, with the
     *         collector's target location mapped to this extractor's source location
     * @throws ResourceInitializationException declared by the meta-collector contract
     */
    public List<MetaCollectorConfiguration> getMetaCollectorClasses(Map<String, Object> map) throws ResourceInitializationException {
        return Arrays.asList(new MetaCollectorConfiguration(CharacterNGramMC.class, map).addStorageMapping(LuceneMC.PARAM_TARGET_LOCATION, LuceneFeatureExtractorBase.PARAM_SOURCE_LOCATION, LuceneMC.LUCENE_DIR));
    }

    /**
     * Logs, at INFO level, how many character n-grams are being selected together with
     * the configured n-gram range and case handling.
     *
     * @param j the number of n-grams reported in the log message
     */
    @Override // org.dkpro.tc.features.ngram.base.LuceneFeatureExtractorBase
    protected void logSelectionProcess(long j) {
        getLogger().log(Level.INFO, "+++ SELECTING THE " + j + " MOST FREQUENT CHARACTER [" + range() + "]-GRAMS (" + caseSensitivity() + ")");
    }

    /** Formats the n-gram size as {@code "n"} when min equals max, otherwise {@code "min-max"}. */
    private String range() {
        if (this.ngramMinN == this.ngramMaxN) {
            return String.valueOf(this.ngramMinN);
        }
        return this.ngramMinN + "-" + this.ngramMaxN;
    }

    /** Describes the case handling for the log message, based on {@code ngramLowerCase}. */
    private String caseSensitivity() {
        return this.ngramLowerCase ? "case-insensitive" : "case-sensitive";
    }
}
