package com.antbrains.nlp.wordseg;

import com.antbrains.crf.BESB1B2MTagConvertor;
import com.antbrains.crf.CrfModel;
import com.antbrains.crf.SgdCrf;
import com.antbrains.crf.TagConvertor;
import com.antbrains.nlp.datrie.DoubleArrayTrie;
import com.antbrains.nlp.wordseg.Token;
import com.antbrains.nlp.wordseg.luceneanalyzer.OffsetAttribute;
import com.antbrains.nlp.wordseg.luceneanalyzer.StandardTokenizer;
import com.antbrains.nlp.wordseg.luceneanalyzer.TypeAttribute;
import com.antbrains.nlp.wordseg.luceneanalyzer.Version;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/* loaded from: input_file:com/antbrains/nlp/wordseg/ChineseSegmenter.class */
/**
 * Singleton entry point for Chinese word segmentation.
 *
 * <p>Input text is first split by a {@link StandardTokenizer} into groups of tokens. Runs of
 * consecutive ideographic tokens are then re-segmented with both {@link MMSeg} and
 * {@link RMMSeg}; wherever the two results disagree, the candidate with the higher
 * {@link SgdCrf#getScore} under the loaded {@link CrfModel} is kept.
 *
 * <p>The CRF model is read from the classpath resource {@code /crf.model} and the dictionary
 * from {@code /segdict.txt}. Loading problems are reported on {@code System.err} and never
 * propagate out of the constructor.
 */
public class ChineseSegmenter {
    private static final ChineseSegmenter instance = new ChineseSegmenter();
    private MMSeg mmseg;
    private RMMSeg rmmseg;
    private final TagConvertor tc = new BESB1B2MTagConvertor();
    private CrfModel model;

    /**
     * Loads the CRF model and the dictionary. The two steps are independent, so a failure to
     * load the model does not prevent the dictionary segmenters from being created.
     */
    private ChineseSegmenter() {
        loadModel();
        loadDictionary();
    }

    /** Reads {@code /crf.model} from the classpath; on failure {@link #model} stays {@code null}. */
    private void loadModel() {
        try {
            InputStream in = ChineseSegmenter.class.getResourceAsStream("/crf.model");
            if (in == null) {
                throw new RuntimeException("can't find /crf.model");
            }
            try {
                this.model = SgdCrf.loadModel(in);
            } finally {
                // Close even when loadModel throws, so the stream is not leaked.
                in.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads {@code /segdict.txt} from the classpath and builds both segmenters from it. When the
     * resource is missing or cannot be read, dictionary-less segmenters are created instead.
     */
    private void loadDictionary() {
        List<String> words = null;
        try {
            InputStream in = ChineseSegmenter.class.getResourceAsStream("/segdict.txt");
            if (in == null) {
                System.err.println("warning: no segdict.txt");
            } else {
                try {
                    words = FileTools.read2List(in, "UTF8");
                } finally {
                    in.close();
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        try {
            if (words != null) {
                this.mmseg = new MMSeg(words);
                this.rmmseg = new RMMSeg(words);
            } else {
                this.mmseg = new MMSeg();
                this.rmmseg = new RMMSeg();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Returns the shared instance. */
    public static ChineseSegmenter getInstance() {
        return instance;
    }

    /** Returns the {@link MMSeg} segmenter built at construction time. */
    public MMSeg getMmseg() {
        return this.mmseg;
    }

    /** Returns the {@link RMMSeg} segmenter built at construction time. */
    public RMMSeg getRmmseg() {
        return this.rmmseg;
    }

    /** Returns the loaded CRF model, or {@code null} if loading failed. */
    public CrfModel getModel() {
        return this.model;
    }

    /**
     * Splits {@code str} into groups of tokens using {@link StandardTokenizer}.
     *
     * <p>Grouping rules:
     * <ul>
     *   <li>every character not covered by any tokenizer token becomes its own single-token
     *       group of type {@code PUNCT};</li>
     *   <li>every non-ideographic token becomes its own single-token group (typed
     *       {@code ALPHA} or {@code NUMBER} when the tokenizer reports {@code <ALPHANUM>} or
     *       {@code <NUM>});</li>
     *   <li>consecutive {@code <IDEOGRAPHIC>} tokens are collected into one group, each typed
     *       {@code CWORD}.</li>
     * </ul>
     *
     * @param str the text to tokenize
     * @return the token groups in text order
     */
    public List<List<Token>> processByLuceneAnalyzer(String str) {
        List<List<Token>> groups = new ArrayList<List<Token>>();
        StandardTokenizer tokenizer = new StandardTokenizer(Version.LUCENE_29, new StringReader(str));
        OffsetAttribute offsetAttribute = (OffsetAttribute) tokenizer.addAttribute(OffsetAttribute.class);
        TypeAttribute typeAttribute = (TypeAttribute) tokenizer.addAttribute(TypeAttribute.class);
        List<Token> current = new ArrayList<Token>();
        // End offset of the last tokenizer token seen; everything before it is already emitted.
        int covered = 0;
        boolean inIdeographicRun = false;
        try {
            while (tokenizer.incrementToken()) {
                int startOffset = offsetAttribute.startOffset();
                int endOffset = offsetAttribute.endOffset();
                if (covered < startOffset) {
                    // Characters skipped by the tokenizer: one PUNCT group per character.
                    for (int pos = covered; pos < startOffset; pos++) {
                        if (!current.isEmpty()) {
                            groups.add(current);
                        }
                        current = new ArrayList<Token>();
                        current.add(new Token(null, str, pos, pos + 1, Token.Type.PUNCT));
                    }
                    inIdeographicRun = false;
                }
                covered = endOffset;
                String type = typeAttribute.type();
                Token token = new Token(str, startOffset, endOffset);
                boolean ideographic = "<IDEOGRAPHIC>".equals(type);
                // Only an ideographic token that continues an ideographic run stays in the
                // current group; anything else starts a new one.
                if (!(ideographic && inIdeographicRun) && !current.isEmpty()) {
                    groups.add(current);
                    current = new ArrayList<Token>();
                }
                inIdeographicRun = ideographic;
                if (ideographic) {
                    token.setType(Token.Type.CWORD);
                } else if ("<ALPHANUM>".equals(type)) {
                    token.setType(Token.Type.ALPHA);
                } else if ("<NUM>".equals(type)) {
                    token.setType(Token.Type.NUMBER);
                }
                current.add(token);
            }
        } catch (IOException e) {
            // Tokenization stops here; the remaining text is emitted character by character below.
            e.printStackTrace();
        } finally {
            try {
                tokenizer.close();
            } catch (IOException ignored) {
                // Best effort: nothing useful can be done if closing the tokenizer fails.
            }
        }
        if (!current.isEmpty()) {
            groups.add(current);
        }
        // Trailing characters after the last tokenizer token.
        for (int pos = covered; pos < str.length(); pos++) {
            List<Token> single = new ArrayList<Token>();
            single.add(new Token(null, str, pos, pos + 1, Token.Type.PUNCT));
            groups.add(single);
        }
        return groups;
    }

    /**
     * Segments {@code str} into a flat token list.
     *
     * <p>Groups from {@link #processByLuceneAnalyzer} with fewer than two tokens are passed
     * through unchanged; larger groups are re-segmented by {@link #segmentSentence}.
     *
     * @param str the text to segment
     * @param doubleArrayTrie passed through unchanged to {@code MMSeg.seg} and {@code RMMSeg.seg}
     * @return the resulting tokens in text order
     */
    public List<Token> seg(String str, DoubleArrayTrie doubleArrayTrie) {
        List<Token> result = new ArrayList<Token>();
        for (List<Token> group : processByLuceneAnalyzer(str)) {
            if (group.size() < 2) {
                result.addAll(group);
            } else {
                result.addAll(segmentSentence(group, doubleArrayTrie));
            }
        }
        return result;
    }

    /** Concatenates the normalized text of all tokens. */
    private String tokens2String(List<Token> list) {
        StringBuilder sb = new StringBuilder();
        for (Token token : list) {
            sb.append(token.getNormalizedText());
        }
        return sb.toString();
    }

    /** Returns true when both lists have the same size and pairwise equal token lengths. */
    private boolean isEqual(List<Token> list, List<Token> list2) {
        if (list.size() != list2.size()) {
            return false;
        }
        for (int i = 0; i < list.size(); i++) {
            if (list.get(i).getLength() != list2.get(i).getLength()) {
                return false;
            }
        }
        return true;
    }

    /**
     * Finds the regions where two segmentations of the same text disagree.
     *
     * <p>Both lists are walked in parallel. When the current tokens differ in length, tokens
     * are consumed from whichever side has covered fewer characters until both sides have
     * covered the same number of characters again; that span is one region.
     *
     * @return one array per region:
     *         {@code {charStart, charEnd, firstIndexInList, firstIndexInList2,
     *         lastIndexInList, lastIndexInList2}} (token indices inclusive, {@code charEnd}
     *         exclusive)
     */
    private List<int[]> compareResult(List<Token> list, List<Token> list2) {
        List<int[]> regions = new ArrayList<int[]>();
        int i = 0;
        int j = 0;
        int len1 = 0;
        int len2 = 0;
        while (i < list.size() && j < list2.size()) {
            Token a = list.get(i);
            Token b = list2.get(j);
            if (a.getLength() == b.getLength()) {
                i++;
                j++;
                len1 += a.getLength();
                len2 += b.getLength();
                continue;
            }
            int firstI = i;
            int firstJ = j;
            int charStart = len1;
            len1 += a.getLength();
            len2 += b.getLength();
            while (len1 != len2) {
                if (len1 < len2) {
                    if (i + 1 >= list.size()) {
                        break; // list exhausted before the two sides lined up again
                    }
                    i++;
                    len1 += list.get(i).getLength();
                } else {
                    if (j + 1 >= list2.size()) {
                        break; // list2 exhausted before the two sides lined up again
                    }
                    j++;
                    len2 += list2.get(j).getLength();
                }
            }
            if (len1 != len2) {
                // The segmentations do not cover the same text; report below.
                break;
            }
            regions.add(new int[]{charStart, len1, firstI, firstJ, i, j});
            i++;
            j++;
        }
        if (i < list.size() || j < list2.size()) {
            // Keep the whole diagnostic on one stream so it is not interleaved with stdout.
            System.err.println("Unexpected here ");
            System.err.println(joinOrigText(list));
            System.err.println(joinOrigText(list2));
        }
        return regions;
    }

    /** Renders each token's original text followed by a space, for diagnostics. */
    private String joinOrigText(List<Token> tokens) {
        StringBuilder sb = new StringBuilder();
        for (Token token : tokens) {
            sb.append(token.getOrigText()).append(' ');
        }
        return sb.toString();
    }

    /**
     * Merges two segmentations: tokens outside the disagreement regions are taken from
     * {@code list}; inside each region the candidate with the higher
     * {@link SgdCrf#getScore} wins ({@code list} on a tie).
     *
     * <p>Each candidate is scored together with one neighbouring token on either side, but
     * only when that neighbour exists in both lists and has the same length in both, so the
     * two scored spans cover the same text.
     */
    private List<Token> selectBest(List<Token> list, List<Token> list2) {
        List<Token> best = new ArrayList<Token>();
        int next = 0;
        for (int[] region : compareResult(list, list2)) {
            int first1 = region[2];
            int first2 = region[3];
            int last1 = region[4];
            int last2 = region[5];
            // Agreed tokens between the previous region and this one.
            best.addAll(list.subList(next, first1));

            boolean leftContext = first1 > 0 && first2 > 0
                    && list.get(first1 - 1).getLength() == list2.get(first2 - 1).getLength();
            // The right-hand neighbour is the token at last + 1; it must exist in both lists.
            boolean rightContext = last1 + 1 < list.size() && last2 + 1 < list2.size()
                    && list.get(last1 + 1).getLength() == list2.get(last2 + 1).getLength();

            List<Token> scored1 = list.subList(
                    leftContext ? first1 - 1 : first1, rightContext ? last1 + 2 : last1 + 1);
            List<Token> scored2 = list2.subList(
                    leftContext ? first2 - 1 : first2, rightContext ? last2 + 2 : last2 + 1);

            boolean keepFirst = SgdCrf.getScore(token2Array(scored1), this.tc, this.model)
                    >= SgdCrf.getScore(token2Array(scored2), this.tc, this.model);
            if (keepFirst) {
                best.addAll(list.subList(first1, last1 + 1));
            } else {
                best.addAll(list2.subList(first2, last2 + 1));
            }
            next = last1 + 1;
        }
        // Agreed tokens after the last region.
        best.addAll(list.subList(next, list.size()));
        return best;
    }

    /**
     * Re-segments one group of tokens with both segmenters and returns the agreed result, or
     * the merge chosen by {@link #selectBest} when they disagree.
     */
    private List<Token> segmentSentence(List<Token> list, DoubleArrayTrie doubleArrayTrie) {
        // NOTE(review): the segmenters only see the concatenated group text, so the offsets of
        // the returned tokens are presumably relative to this group, not to the full input.
        String text = tokens2String(list);
        List<Token> forward = this.mmseg.seg(text, doubleArrayTrie);
        List<Token> backward = this.rmmseg.seg(text, doubleArrayTrie);
        return isEqual(forward, backward) ? forward : selectBest(forward, backward);
    }

    /** Returns the original text of each token, in order. */
    private String[] token2Array(List<Token> list) {
        String[] texts = new String[list.size()];
        int idx = 0;
        for (Token token : list) {
            texts[idx++] = token.getOrigText();
        }
        return texts;
    }
}
