package org.opensextant.xtext;

import gnu.getopt.Getopt;
import gnu.getopt.LongOpt;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import javax.activation.MimeType;
import javax.activation.MimeTypeParseException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOCase;
import org.apache.commons.io.filefilter.FileFilterUtils;
import org.apache.commons.io.filefilter.SuffixFileFilter;
import org.apache.commons.lang3.StringUtils;
import org.apache.tika.io.IOUtils;
import org.opensextant.ConfigException;
import org.opensextant.util.FileUtility;
import org.opensextant.xtext.collectors.ArchiveNavigator;
import org.opensextant.xtext.collectors.mailbox.OutlookPSTCrawler;
import org.opensextant.xtext.converters.DefaultConverter;
import org.opensextant.xtext.converters.EmbeddedContentConverter;
import org.opensextant.xtext.converters.ImageMetadataConverter;
import org.opensextant.xtext.converters.MessageConverter;
import org.opensextant.xtext.converters.TextTranscodingConverter;
import org.opensextant.xtext.converters.TikaHTMLConverter;
import org.opensextant.xtext.converters.WebArchiveConverter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/opensextant/xtext/XText.class */
public final class XText implements ExclusionFilter, Converter {
    /**
     * Converters keyed by lower-case file extension. This map is static, so it is shared by every
     * XText instance in the JVM; it is filled by {@link #setup()} and emptied by
     * {@link #clearSettings()}.
     */
    public static Map<String, Converter> converters = new HashMap<>();
    /** Fallback converter used when no extension-specific converter is registered; created in setup(). */
    private Converter defaultConversion;
    /** Converter used for embedded-content extraction when that mode is enabled; created in setup(). */
    private Converter embeddedConversion;
    /** Default upper bound on the size of a file that will be converted: 32 MiB (32 * 1024 * 1024 bytes). */
    public static final long FILE_SIZE_LIMIT = 33554432;
    private final Logger log = LoggerFactory.getLogger(getClass());
    /** Whether the HTML converter is asked to scrub its input. */
    private boolean scrubHTML = false;
    /** Owner of all input/output/cache path decisions. */
    private final PathManager paths = new PathManager();
    /** Whether embedded objects are extracted from supported container formats. */
    private boolean extractEmbedded = false;
    /** Buffer limit handed to the default and embedded converters. */
    private int maxBuffer = DefaultConverter.MAX_TEXT_SIZE;
    /**
     * Buffer limit handed to the HTML converter. NOTE: this is computed once, at construction, from
     * the initial value of maxBuffer; later calls to setMaxBufferSize() do not change it.
     */
    private final int maxHTMLBuffer = 5 * this.maxBuffer;
    /** Files larger than this many bytes are skipped by convertFile(). */
    private long maxFileSize = FILE_SIZE_LIMIT;
    /** Lower-case extensions that are treated as archives to be expanded. */
    protected Set<String> archiveFileTypes = new HashSet<>();
    /** Lower-case extensions the caller wants converted. */
    private final Set<String> requestedFileTypes = new HashSet<>();
    /** Lower-case extensions that are explicitly skipped. */
    private final Set<String> ignoreFileTypes = new HashSet<>();
    /** Whether files without any extension are eligible for conversion. */
    private boolean allowNoExtension = false;
    /** Optional callback invoked after each top-level conversion. */
    private ConversionListener postProcessor = null;
    /** When true, PST files are not routed to the dedicated PST crawler. */
    private boolean useTikaPST = false;
    /** Sum of per-document conversion times (ms) recorded by trackStatistics(). */
    protected long total_conv_time = 0;
    /** Average conversion time (ms); only refreshed by reportStatistics(). */
    protected int average_conv_time = 0;
    /** Number of documents passed through trackStatistics(), converted or not. */
    protected int total_conversions = 0;
    /** Wall-clock start of the most recent extractText() call (ms since epoch). */
    protected long start_time = 0;
    /** Wall-clock end of the most recent extractText() call (ms since epoch). */
    protected long stop_time = 0;
    /** Snapshot of requestedFileTypes taken by setup(); used as suffix filter by convertFolder(). */
    private String[] fileFilters = null;

    /* loaded from: input_file:org/opensextant/xtext/XText$MainProgramListener.class */
    /**
     * Listener used by the command-line entry point: it logs one line per conversion and,
     * in verbose mode, also logs the properties of the converted document.
     */
    static class MainProgramListener implements ConversionListener {
        private final Logger log = LoggerFactory.getLogger(getClass());
        /** When true, document properties are logged in addition to the status line. */
        private final boolean verbosity;

        /**
         * @param z true to enable verbose logging of document properties
         */
        public MainProgramListener(boolean z) {
            this.verbosity = z;
        }

        /**
         * Logs the outcome of a single conversion.
         *
         * @param convertedDocument the conversion result; may be null when nothing was produced
         * @param str path of the file that was processed
         */
        @Override // org.opensextant.xtext.ConversionListener
        public void handleConversion(ConvertedDocument convertedDocument, String str) {
            boolean present = convertedDocument != null;
            boolean converted = present && convertedDocument.is_converted;
            this.log.info("Converted. FILE={} Status={}, Converted={}",
                    new Object[] {str, Boolean.valueOf(present), Boolean.valueOf(converted)});
            // A null document has no properties to report; without this guard verbose mode
            // would throw a NullPointerException for every file that yields no document.
            if (this.verbosity && present) {
                this.log.info("\t {}", convertedDocument.getProperties());
            }
        }
    }

    /**
     * Gives access to the path manager that controls where inputs are read from and where
     * conversions and extracted children are written.
     *
     * @return the path manager owned by this instance (never replaced after construction)
     */
    public PathManager getPathManager() {
        return paths;
    }

    /**
     * Creates an instance pre-loaded with the default archive and document file types
     * (see {@link #defaults()}). Converters are not created here; {@link #setup()} must be
     * called before any conversion is attempted.
     */
    public XText() {
        defaults();
    }

    /**
     * Turns overwriting of previously saved conversions on or off.
     * The flag is stored in the static field {@code ConvertedDocument.overwrite}, so the
     * setting is shared by every XText instance in the JVM.
     *
     * @param z true to overwrite existing conversions
     */
    public void enableOverwrite(boolean z) {
        final boolean overwriteExisting = z;
        ConvertedDocument.overwrite = overwriteExisting;
    }

    /**
     * Sets the folder where conversions are cached.
     *
     * @param str path of the conversion cache folder
     * @throws IOException if the path manager rejects the folder
     * @deprecated call {@code getPathManager().setConversionCache(path)} directly; this method
     *             only forwards to it.
     */
    @Deprecated
    public void setArchiveDir(String str) throws IOException {
        getPathManager().setConversionCache(str);
    }

    /**
     * Sets the buffer limit passed to the default and embedded-content converters.
     * The value is read when {@link #setup()} builds those converters, so it must be set
     * before calling setup(). The HTML converter's limit is a final field fixed at
     * construction and is not affected by this call.
     *
     * @param i new buffer limit
     */
    public void setMaxBufferSize(int i) {
        maxBuffer = i;
    }

    /**
     * Sets the largest file size, in bytes, that will be converted; larger files are skipped.
     * The default is {@link #FILE_SIZE_LIMIT}.
     *
     * @param i maximum file size in bytes
     */
    public void setMaxFileSize(int i) {
        final long limitBytes = i;
        maxFileSize = limitBytes;
    }

    /**
     * Controls whether files that have no extension are eligible for conversion.
     *
     * @param z true to accept files without an extension
     */
    public void enableNoFileExtension(boolean z) {
        allowNoExtension = z;
    }

    /**
     * Controls whether the HTML converter is created with scrubbing enabled.
     * The flag is read when {@link #setup()} builds the HTML converter.
     *
     * @param z true to scrub HTML
     */
    public void enableHTMLScrubber(boolean z) {
        scrubHTML = z;
    }

    /**
     * Controls whether the embedded-content converter is used for the file types it supports
     * when no extension-specific converter is registered.
     *
     * @param z true to extract embedded content
     */
    public void enableEmbeddedExtraction(boolean z) {
        extractEmbedded = z;
    }

    /**
     * Turns saving of conversions on or off; the setting is held by the path manager.
     *
     * @param z true to save conversions
     */
    public void enableSaving(boolean z) {
        getPathManager().enableSaving(z);
    }

    /**
     * Adds a file extension to the set of types that should be converted.
     * The extension is stored in lower case. Call this before {@link #setup()}: setup() takes
     * the snapshot of requested types that folder traversal filters on.
     *
     * @param str file extension without the leading dot; a null value is ignored, matching
     *            the behaviour of {@link #ignoreFileType(String)}
     */
    public void convertFileType(String str) {
        if (str != null) {
            this.requestedFileTypes.add(str.toLowerCase());
        }
    }

    /**
     * Adds a file extension to the set of types that are explicitly skipped.
     * The extension is stored in lower case.
     *
     * @param str file extension without the leading dot; null is silently ignored
     */
    public void ignoreFileType(String str) {
        if (str == null) {
            return;
        }
        ignoreFileTypes.add(str.toLowerCase());
    }

    /**
     * Registers the callback that is invoked after each top-level file has been processed.
     *
     * @param conversionListener listener to notify; null disables notification
     */
    public void setConversionListener(ConversionListener conversionListener) {
        postProcessor = conversionListener;
    }

    /**
     * Chooses how PST files are handled. When enabled, PST files are no longer routed to
     * {@code convertOutlookPST} and instead go through the regular converter lookup.
     *
     * @param z true to bypass the dedicated PST crawler
     */
    public void enableTikaPST(boolean z) {
        useTikaPST = z;
    }

    /**
     * Tells whether a file name carries one of the configured archive extensions.
     * The comparison is made on the lower-cased extension.
     *
     * @param str file name or path
     * @return true if the extension is a known archive type; false otherwise, including when
     *         no extension can be determined
     */
    public boolean isArchive(String str) {
        final String ext = FilenameUtils.getExtension(str);
        return ext != null && archiveFileTypes.contains(ext.toLowerCase());
    }

    /**
     * Tells whether a file name has the PST extension.
     *
     * @param str file name or path
     * @return true if the extension equals "pst", ignoring case
     */
    public boolean isPST(String str) {
        final String ext = FilenameUtils.getExtension(str);
        return isPSTExtension(ext);
    }

    /**
     * Tells whether the given extension denotes a PST file.
     *
     * @param str file extension without the leading dot; may be null
     * @return true if the extension equals "pst", ignoring case; false for null
     */
    public static boolean isPSTExtension(String str) {
        return str != null && "pst".equalsIgnoreCase(str);
    }

    /**
     * Records one processed document in the running statistics.
     * The document counter is always incremented; the conversion time is only accumulated
     * when a document is actually supplied.
     *
     * @param convertedDocument the processed document, or null
     */
    protected void trackStatistics(ConvertedDocument convertedDocument) {
        total_conversions++;
        if (convertedDocument == null) {
            return;
        }
        total_conv_time += convertedDocument.conversion_time;
    }

    /**
     * Recomputes the average conversion time from the running totals and logs a summary.
     * The division is done in floating point and the result truncated to an int.
     */
    public void reportStatistics() {
        final float elapsed = (float) total_conv_time;
        average_conv_time = (int) (elapsed / total_conversions);
        final String summary = "TOTAL of N=" + total_conversions + " documents converted\n With an average time (ms) of " + average_conv_time;
        log.info(summary);
    }

    /**
     * Main entry point: converts a single file, a folder, an archive or a PST mailbox.
     * The input path is made absolute and normalised, then dispatched by type. Archives are
     * recognised by name first, then PST files (unless the Tika PST mode is on), then plain
     * files, then directories. Start/stop times are recorded and statistics are reported
     * at the end.
     *
     * @param str path of the file or folder to process
     * @throws IOException if the path cannot be normalised or does not exist, or a conversion fails
     * @throws ConfigException if the input lies inside an XText cache folder, or a delegate
     *         reports a configuration problem
     */
    public void extractText(String str) throws IOException, ConfigException {
        start_time = System.currentTimeMillis();
        log.info("Conversion.  INPUT PATH={}", str);

        final String normalized = FilenameUtils.normalize(new File(str).getAbsolutePath(), true);
        if (normalized == null) {
            throw new IOException("Failed to normalize the path: " + str);
        }

        final File input = new File(normalized);
        if (!input.exists()) {
            throw new IOException("Non existent input FILE=" + normalized);
        }
        if (PathManager.isXTextCache(normalized)) {
            throw new ConfigException("XText cannot be directed to extract text from its own cache files. Move the cache files out of ./xtext/ folders if you really need to do this.");
        }

        final String inputName = input.getName();
        if (isArchive(inputName)) {
            convertArchive(input);
        } else if (isPST(inputName) && !useTikaPST) {
            convertOutlookPST(input);
        } else if (input.isFile()) {
            paths.setInputRoot(input);
            convertFile(input);
        } else if (input.isDirectory()) {
            paths.setInputRoot(input);
            convertFolder(input);
        }

        stop_time = System.currentTimeMillis();

        // Tell the user where to look for results, depending on the save mode.
        if (paths.isSaving()) {
            if (!paths.isSaveWithInput()) {
                log.info("Output can be accessed at " + paths.getConversionCache());
            } else {
                log.info("Output can be accessed at from the input folder {} in 'xtext' sub-folders", input.getParent());
            }
        }
        reportStatistics();
    }

    /**
     * File-based variant of the exclusion filter.
     *
     * @param file candidate file
     * @return true if the file is part of an XText cache or is rejected by the
     *         path-based filter
     */
    private boolean filterOutFile(File file) {
        return PathManager.isXTextCache(file) || filterOutFile(file.getAbsolutePath());
    }

    /**
     * Decides whether a path should be excluded from conversion.
     * A path is excluded when it is inside an XText cache, its base name starts with a dot,
     * the path contains ".svn", or its base name ends with the converted-text suffix.
     * Otherwise the decision rests on the extension: a blank extension is excluded unless
     * extension-less files are allowed, and any other extension is excluded unless it was
     * requested.
     *
     * @param str path of the candidate file
     * @return true if the file must be skipped
     */
    @Override
    public boolean filterOutFile(String str) {
        if (PathManager.isXTextCache(str)) {
            return true;
        }

        final String base = FilenameUtils.getBaseName(str);
        if (base.startsWith(".")) {
            return true;
        }
        if (str.contains(".svn")) {
            return true;
        }
        if (base.endsWith(ConvertedDocument.CONVERTED_TEXT_EXT)) {
            return true;
        }

        final String ext = FilenameUtils.getExtension(str);
        if (StringUtils.isBlank(ext)) {
            return !allowNoExtension;
        }
        return !requestedFileTypes.contains(ext.toLowerCase());
    }

    /**
     * Expands an archive into its export folder and converts the entries it contains.
     * Nothing happens if the path manager does not approve the export for this archive.
     * While the archive is processed the strip-prefix path points at the export folder;
     * the previous value is restored afterwards, even if expansion fails part-way, so that
     * one broken archive does not corrupt path handling for subsequent files.
     *
     * @param file the archive to expand
     * @throws IOException if expansion or conversion of an entry fails
     * @throws ConfigException if the archive navigator reports a configuration problem
     */
    public void convertArchive(File file) throws IOException, ConfigException {
        if (!this.paths.verifyArchiveExport(file.getAbsolutePath())) {
            return;
        }
        File exportDir = this.paths.getArchiveExportDir(file);
        String savedPrefix = this.paths.getStipPrefixPath();
        this.paths.setStripPrefixPath(exportDir.getAbsolutePath());
        try {
            this.paths.setInputRoot(exportDir);
            // This instance acts as both constructor arguments after the export path
            // (it implements ExclusionFilter and Converter).
            ArchiveNavigator navigator = new ArchiveNavigator(file, exportDir.getAbsolutePath(), this, this);
            navigator.overwrite = ConvertedDocument.overwrite;
            this.log.info("\tArchive Found ({}). Expanding to {}", file, exportDir);
            navigator.collect();
        } finally {
            this.paths.setStripPrefixPath(savedPrefix);
        }
    }

    /**
     * Crawls an Outlook PST mailbox, exporting its content to the archive export folder and
     * converting it with this instance.
     * An error is logged when saving is disabled, but processing still proceeds.
     * While the mailbox is processed the strip-prefix path points at the export folder; the
     * previous value is restored afterwards whether or not the crawl succeeds.
     *
     * @param file the PST file
     * @throws ConfigException if the crawler cannot be configured, or the crawl fails for any
     *         reason (the original exception is kept as the cause)
     * @throws IOException if configuring the crawler fails with an I/O error
     */
    public void convertOutlookPST(File file) throws ConfigException, IOException {
        if (!this.paths.isSaving()) {
            this.log.error("Warning -- PST file found, but save = true is required to parse it.  Enable saving and chose a cache folder");
        }
        OutlookPSTCrawler crawler = new OutlookPSTCrawler(file);
        crawler.setConverter(this);
        crawler.overwriteMode = ConvertedDocument.overwrite;
        crawler.incrementalMode = true;

        File exportDir = this.paths.getArchiveExportDir(file);
        String savedPrefix = this.paths.getStipPrefixPath();
        this.paths.setStripPrefixPath(exportDir.getAbsolutePath());
        try {
            this.paths.setInputRoot(exportDir);
            crawler.setOutputPSTDir(exportDir);
            crawler.configure();
            this.log.info("\tPST Email Archive Found ({}). Expanding to {}", file, exportDir);
            try {
                crawler.collect();
            } catch (Exception e) {
                throw new ConfigException("Unable to fully digest PST file " + file, e);
            }
        } finally {
            // Restore on every exit path; previously a failed crawl left the prefix
            // pointing at this mailbox's export folder.
            this.paths.setStripPrefixPath(savedPrefix);
        }
    }

    /**
     * Converter interface entry point for files; equivalent to converting the file as a
     * top-level document with no parent.
     *
     * @param file the file to convert
     * @return the converted document, or null if the file was skipped
     * @throws IOException if conversion fails
     * @throws ConfigException if a delegate reports a configuration problem
     */
    @Override
    public ConvertedDocument convert(File file) throws IOException, ConfigException {
        return convertFile(file, null);
    }

    /**
     * Converter interface entry point for raw data; not supported by this class.
     *
     * @param str ignored
     * @return never returns normally
     * @throws IOException always, directing the caller to a concrete converter
     */
    @Override
    public ConvertedDocument convert(String str) throws IOException {
        final String reason = "Unsupported interface:  To convert text or binary data directly you must use an instance of a XText converter, e.g., TikaHTMLConverter";
        throw new IOException(reason);
    }

    /**
     * Converts a single top-level file, i.e. one that has no parent document.
     *
     * @param file the file to convert
     * @return the converted document, or null if the file was skipped
     * @throws IOException if conversion fails
     * @throws ConfigException if a delegate reports a configuration problem
     */
    public ConvertedDocument convertFile(File file) throws IOException, ConfigException {
        final ConvertedDocument noParent = null;
        return convertFile(file, noParent);
    }

    /**
     * Converts one file, optionally as the child of an already converted parent document.
     *
     * Returns null without converting when: the file is a top-level file rejected by the
     * exclusion filter; extension-less files are not allowed and the extension is ignored or
     * not requested; the file is an archive or PST (those are delegated and produce no
     * single document); or the file exceeds the maximum file size.
     *
     * @param file the file to convert
     * @param convertedDocument the parent document when converting an extracted child,
     *        or null for a top-level file
     * @return the converted (or cached) document; a bare document wrapping the file if the
     *         converter returned null; or null if the file was skipped
     * @throws IOException if saving is enabled without an input root being configured, or
     *         wrapping any exception raised during conversion
     * @throws ConfigException declared for the archive/PST delegates
     */
    public ConvertedDocument convertFile(File file, ConvertedDocument convertedDocument) throws IOException, ConfigException {
        // The exclusion filter is only applied to top-level files; children bypass it.
        if (convertedDocument == null && filterOutFile(file)) {
            return null;
        }
        if (this.paths.isSaving() && !this.paths.isSaveWithInput() && !this.paths.hasInputRoot()) {
            throw new IOException("Please set an input root; convertFile() was called in save/cache mode without having PathManager setup");
        }
        String name = file.getName();
        String lowerCase = FilenameUtils.getExtension(name).toLowerCase();
        // NOTE: this extension check is skipped entirely when extension-less files are allowed.
        if (!this.allowNoExtension && (this.ignoreFileTypes.contains(lowerCase) || !this.requestedFileTypes.contains(lowerCase))) {
            return null;
        }
        this.log.debug("Converting FILE=" + file.getAbsolutePath());
        // Containers are expanded by dedicated handlers which convert their entries themselves.
        if (isArchive(name)) {
            convertArchive(file);
            return null;
        }
        if (isPSTExtension(lowerCase) && !this.useTikaPST) {
            convertOutlookPST(file);
            return null;
        }
        if (FileUtils.sizeOf(file) > this.maxFileSize) {
            this.log.info("Valid File is too large FILE=" + file.getAbsolutePath());
            return null;
        }
        // z: whether a previously cached conversion may be reused; cleared when the
        // embedded-content converter is selected.
        boolean z = true;
        Converter converter = converters.get(lowerCase);
        if (converter == null) {
            if (this.extractEmbedded && EmbeddedContentConverter.isSupported(lowerCase)) {
                converter = this.embeddedConversion;
                z = false;
            } else {
                converter = this.defaultConversion;
            }
        }
        ConvertedDocument convertedDocument2 = null;
        if (z && !ConvertedDocument.overwrite && this.paths.isSaving()) {
            convertedDocument2 = this.paths.getCachedConversion(file);
        }
        // No usable cached copy: run the converter and time it.
        if (convertedDocument2 == null) {
            long currentTimeMillis = System.currentTimeMillis();
            try {
                convertedDocument2 = converter.convert(file);
                int currentTimeMillis2 = (int) (System.currentTimeMillis() - currentTimeMillis);
                if (convertedDocument2 == null) {
                    // Converter produced nothing; hand back a bare document for the file.
                    convertedDocument2 = new ConvertedDocument(file);
                } else if (this.paths.isSaving() && convertedDocument2.is_converted) {
                    if (convertedDocument != null) {
                        convertedDocument2.setParent(convertedDocument);
                    }
                    this.paths.saveConversion(convertedDocument2);
                    // Children are only written out and converted when saving is enabled.
                    if (convertedDocument2.hasRawChildren()) {
                        convertChildren(convertedDocument2);
                    }
                }
                convertedDocument2.conversion_time = currentTimeMillis2;
                if (convertedDocument2.filetime == null) {
                    convertedDocument2.filetime = convertedDocument2.getFiletime();
                }
            } catch (Exception e) {
                // Every failure, checked or not, is reported as an IOException with the cause kept.
                throw new IOException("Conversion error FILE=" + file.getPath(), e);
            }
        }
        // The listener is only notified for top-level documents, not for children.
        if (this.postProcessor != null && convertedDocument == null) {
            this.postProcessor.handleConversion(convertedDocument2, file.getAbsolutePath());
        }
        trackStatistics(convertedDocument2);
        return convertedDocument2;
    }

    /**
     * Converts every file under a folder whose name ends (case-insensitively) with one of the
     * requested file types captured by {@link #setup()}. The directory filter accepts every
     * sub-directory. A failure on one file is logged and does not stop the traversal.
     *
     * @param file the folder to traverse
     * @throws IOException declared for callers; per-file failures are logged, not thrown
     */
    public void convertFolder(File file) throws IOException {
        final SuffixFileFilter bySuffix = new SuffixFileFilter(fileFilters, IOCase.INSENSITIVE);
        for (File candidate : FileUtils.listFiles(file, bySuffix, FileFilterUtils.trueFileFilter())) {
            try {
                convertFile(candidate);
            } catch (Exception failure) {
                log.error("Conversion error, FILE=" + candidate.getPath(), failure);
            }
        }
    }

    /**
     * Writes the raw children of a converted document (attachments, embedded objects) to the
     * parent's container folder and converts each of them as a child of that parent.
     * Web archives are skipped. A child with no content is logged and skipped; a failure on
     * one child is logged and processing continues with the next.
     *
     * Each child file is fully written and closed before it is handed to the converter, and a
     * child whose id would resolve outside the parent container is rejected rather than
     * written.
     *
     * @param convertedDocument the parent document whose raw children are to be processed
     * @throws IOException if the parent container folder cannot be prepared
     */
    public void convertChildren(ConvertedDocument convertedDocument) throws IOException {
        if (convertedDocument.is_webArchive) {
            return;
        }
        convertedDocument.evalParentChildContainer();
        FileUtility.makeDirectory(convertedDocument.parentContainer);
        String containerPath = convertedDocument.parentContainer.getAbsolutePath();
        for (Content content : convertedDocument.getRawChildren()) {
            if (content.content == null) {
                this.log.error("Attempted to write out child object with no content {}", content.id);
                continue;
            }
            try {
                File childFile = resolveChildFile(containerPath, content.id);
                // Close the stream before converting: the converter reads this same file back.
                try (FileOutputStream out = new FileOutputStream(childFile)) {
                    IOUtils.write(content.content, out);
                }
                ConvertedDocument child = convertFile(childFile, convertedDocument);
                if (child == null) {
                    continue;
                }
                if (child.is_converted) {
                    // Carry the child's metadata over and persist the text buffer with it.
                    for (String key : content.meta.stringPropertyNames()) {
                        child.addUserProperty(key, content.meta.getProperty(key));
                    }
                    child.saveBuffer(new File(child.textpath));
                }
                if (content.mimeType != null) {
                    try {
                        child.setMimeType(new MimeType(content.mimeType));
                    } catch (MimeTypeParseException e) {
                        this.log.warn("Invalid mime type encountered: {} ignoring.", content.mimeType);
                    }
                }
                convertedDocument.addChild(child);
            } catch (Exception e) {
                this.log.error("Failed to write out child {}, but will continue with others", content.id, e);
            }
        }
    }

    /**
     * Resolves a child id against its container folder and verifies the result stays inside
     * that folder. Child ids come from document content (e.g. attachment names) and
     * FilenameUtils.concat returns an absolute second argument unchanged, so an unchecked id
     * could point anywhere on disk.
     *
     * @param containerPath absolute path of the parent container folder
     * @param childId id of the child, used as its file name
     * @return the file to write the child to, inside the container
     * @throws IOException if the id is invalid or resolves outside the container
     */
    private static File resolveChildFile(String containerPath, String childId) throws IOException {
        String childPath = FilenameUtils.concat(containerPath, childId);
        if (childPath == null) {
            throw new IOException("Invalid child path for id=" + childId);
        }
        File container = new File(containerPath).getCanonicalFile();
        File canonicalChild = new File(childPath).getCanonicalFile();
        if (canonicalChild.equals(container) || !canonicalChild.toPath().startsWith(container.toPath())) {
            throw new IOException("Child id resolves outside of its parent container, id=" + childId);
        }
        // Return the non-canonical path so downstream path handling sees the same form as before.
        return new File(childPath);
    }

    /**
     * Registers the built-in archive extensions and the built-in set of document
     * extensions that are converted by default. Existing entries are kept; this only adds.
     */
    public void defaults() {
        final String[] archiveTypes = {"zip", "gz", "tar", "tgz", "tar.gz"};
        for (String ext : archiveTypes) {
            archiveFileTypes.add(ext);
        }

        final String[] documentTypes = {
            "doc", "docx", "pdf", "htm", "html", "txt", "msg", "eml", "emlx",
            "ppt", "pptx", "xlsx", "xls", "rtf", "dot", "dotx", "odt", "odf",
            "docm", "mht", "jpg", "jpeg", "pst"
        };
        for (String ext : documentTypes) {
            requestedFileTypes.add(ext);
        }
    }

    /**
     * Removes all requested file types and all registered converters.
     * The converter registry is static, so clearing it affects every XText instance.
     * Ignored and archive file types are left untouched.
     */
    public void clearSettings() {
        converters.clear();
        requestedFileTypes.clear();
    }

    /**
     * Builds the converters and finalises the configuration; must be called after the
     * options and file types have been set and before any conversion.
     *
     * Creates the default and embedded converters, configures the path manager, registers
     * extension-specific converters for the requested types (requesting "html" also enables
     * "htm" and "xhtml"), marks "&lt;type&gt;.txt" as ignored for every requested type, and
     * snapshots the requested types for folder traversal.
     *
     * @throws IOException if the path manager cannot be configured
     */
    public void setup() throws IOException {
        defaultConversion = new DefaultConverter(maxBuffer);
        embeddedConversion = new EmbeddedContentConverter(maxBuffer);
        paths.configure();

        if (requestedFileTypes.contains("txt")) {
            converters.put("txt", new TextTranscodingConverter());
        }

        if (requestedFileTypes.contains("html")) {
            // One shared HTML converter serves all three HTML-like extensions.
            final TikaHTMLConverter htmlConverter = new TikaHTMLConverter(scrubHTML, maxHTMLBuffer);
            for (String ext : new String[] {"html", "htm", "xhtml"}) {
                converters.put(ext, htmlConverter);
            }
            requestedFileTypes.add("htm");
            requestedFileTypes.add("xhtml");
        }

        final MessageConverter mailConverter = new MessageConverter();
        for (String ext : new String[] {"eml", "msg"}) {
            if (requestedFileTypes.contains(ext)) {
                converters.put(ext, mailConverter);
            }
        }

        final WebArchiveConverter mhtConverter = new WebArchiveConverter();
        if (requestedFileTypes.contains("mht")) {
            converters.put("mht", mhtConverter);
        }

        final ImageMetadataConverter imageConverter = new ImageMetadataConverter();
        for (String ext : new String[] {"jpeg", "jpg"}) {
            if (requestedFileTypes.contains(ext)) {
                converters.put(ext, imageConverter);
            }
        }

        // Ignore "<type>.txt" for every requested type.
        for (String requested : requestedFileTypes) {
            ignoreFileType(requested + ".txt");
        }

        fileFilters = (String[]) requestedFileTypes.toArray(new String[requestedFileTypes.size()]);
    }

    /**
     * Returns the file types requested for conversion.
     * This is the live internal set, not a copy: changes made through it affect this instance.
     *
     * @return the set of requested lower-case extensions
     */
    public Set<String> getFileTypes() {
        final Set<String> requested = requestedFileTypes;
        return requested;
    }

    /**
     * Prints the command-line help to standard output.
     * The option descriptions follow what {@link #main(String[])} actually does:
     * --embed-conversion saves conversions with the input, --embed-children saves extracted
     * children with the input. Only long options exist, so the text refers to them by their
     * long names.
     */
    public static void usage() {
        System.out.println();
        System.out.println("==========XText Usage=============");
        System.out.println("XText --input input  [--help] [--verbose] \n\t[--embed-conversion | --output folder ]   \n\t[--embed-children   | --export folder] \n\t[--clean-html]   [--strip-prefix path]   [--tika-pst]");
        System.out.println(" --help  print this message");
        System.out.println(" --verbose  also log the properties of each converted document");
        System.out.println(" --input  where <input> is file or folder");
        System.out.println(" --output  where <folder> is a folder where you want to archive converted docs");
        System.out.println(" --embed-conversion embeds the saved conversions in the input folder under 'xtext/'");
        System.out.println(" --embed-children embeds the extracted children binaries in the input folder");
        System.out.println("     (NOT the conversions, the binaries from Archives, PST, etc)");
        System.out.println("     Default behavior is to extract originals to output archive.");
        System.out.println(" --export folder\tOpposite of --embed-children. Extract children and save to <folder>");
        System.out.println("     NOTE: --embed-conversion has same effect as setting output to input");
        System.out.println(" --clean-html enables HTML scrubbing");
        System.out.println(" --strip-prefix  where <path> is the path prefix to strip");
        System.out.println(" --tika-pst  do not use the dedicated PST crawler for PST files");
        System.out.println("========================");
    }

    /**
     * Command-line entry point. Parses the long options, configures an XText instance and
     * runs the extraction on the given input.
     *
     * Exits with status 1 on an option-parsing error, an unknown option or --help, and with
     * status -1 when no --input is given. An IOException during setup or extraction prints
     * the usage text and the stack trace.
     *
     * @param strArr command-line arguments
     */
    public static void main(String[] strArr) {
        int i;
        // Only long options are defined (the short-option string is empty). Each LongOpt maps
        // to a character code returned by getopt(): 105='i', 111='o', 120='x', 112='p',
        // 118='v', 104='h', 72='H', 101='e', 99='c', 84='T'. The second constructor argument
        // is 1 for options that take a value and 0 for flags.
        Getopt getopt = new Getopt("XText", strArr, "", new LongOpt[]{new LongOpt("input", 1, (StringBuffer) null, 105), new LongOpt("output", 1, (StringBuffer) null, 111), new LongOpt("export", 1, (StringBuffer) null, 120), new LongOpt("strip-prefix", 1, (StringBuffer) null, 112), new LongOpt("verbose", 0, (StringBuffer) null, 118), new LongOpt("help", 0, (StringBuffer) null, 104), new LongOpt("clean-html", 0, (StringBuffer) null, 72), new LongOpt("embed-conversion", 0, (StringBuffer) null, 101), new LongOpt("embed-children", 0, (StringBuffer) null, 99), new LongOpt("tika-pst", 0, (StringBuffer) null, 84)});
        // str: --input, str2: --output, z: --embed-conversion, z2: --clean-html,
        // z3: --embed-children, z4: --verbose, str3: --export, str4: --strip-prefix.
        String str = null;
        String str2 = null;
        boolean z = false;
        boolean z2 = false;
        boolean z3 = false;
        boolean z4 = false;
        String str3 = null;
        String str4 = null;
        XText xText = new XText();
        while (true) {
            try {
                i = getopt.getopt();
            } catch (Exception e) {
                usage();
                System.exit(1);
            }
            // -1 signals that all options have been consumed: configure and run.
            if (i == -1) {
                if (str == null) {
                    System.out.println("An input argument is required, e.g., -input=/Folder/...");
                    System.exit(-1);
                }
                xText.enableOverwrite(true);
                xText.enableSaving(z || str2 != null);
                xText.getPathManager().enableSaveWithInput(z);
                xText.enableHTMLScrubber(z2);
                xText.getPathManager().enableSaveChildrenWithInput(z3);
                xText.getPathManager().setStripPrefixPath(str4);
                // An export folder is only used when children are not saved with the input.
                if (!z3 && str3 != null) {
                    xText.getPathManager().setExtractedChildrenCache(str3);
                }
                try {
                    // When conversions are not saved with the input, saving is always enabled:
                    // to the given output folder, or to ./output when none was given.
                    if (!z) {
                        if (str2 == null) {
                            xText.enableSaving(true);
                            FileUtility.makeDirectory("output");
                            xText.getPathManager().setConversionCache("output");
                            System.out.println("Default output folder is $PWD/output");
                        } else {
                            xText.enableSaving(true);
                            xText.getPathManager().setConversionCache(str2);
                        }
                    }
                    xText.setConversionListener(new MainProgramListener(z4));
                    xText.setup();
                    xText.extractText(str);
                    return;
                } catch (IOException e2) {
                    usage();
                    e2.printStackTrace();
                    return;
                }
            }
            switch (i) {
                case 0:
                    break;
                case 72:
                    // --clean-html
                    z2 = true;
                    break;
                case 84:
                    // --tika-pst
                    xText.enableTikaPST(true);
                    break;
                case 99:
                    // --embed-children
                    z3 = true;
                    break;
                case 101:
                    // --embed-conversion
                    z = true;
                    System.out.println("Saving conversions to Input folder.  Output folder will be ignored.");
                    break;
                case 104:
                default:
                    // --help and any unrecognised option: print usage and exit.
                    usage();
                    System.exit(1);
                    break;
                case 105:
                    // --input
                    str = getopt.getOptarg();
                    break;
                case 111:
                    // --output
                    str2 = getopt.getOptarg();
                    break;
                case 112:
                    // --strip-prefix
                    str4 = getopt.getOptarg();
                    break;
                case 118:
                    // --verbose
                    z4 = true;
                    break;
                case 120:
                    // --export
                    str3 = getopt.getOptarg();
                    break;
            }
        }
    }
}
