/*
 * Decompiled with CFR 0.152.
 */
package org.apache.tika.eval.app;

import java.io.IOException;
import java.nio.file.Path;
import java.sql.Types;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.tika.batch.FileResource;
import org.apache.tika.eval.app.AbstractProfiler;
import org.apache.tika.eval.app.EvalFilePaths;
import org.apache.tika.eval.app.db.ColInfo;
import org.apache.tika.eval.app.db.Cols;
import org.apache.tika.eval.app.db.TableInfo;
import org.apache.tika.eval.app.io.ExtractReader;
import org.apache.tika.eval.app.io.ExtractReaderException;
import org.apache.tika.eval.app.io.IDBWriter;
import org.apache.tika.eval.core.util.ContentTags;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;

public class ExtractProfiler
extends AbstractProfiler {
    private static final String FIELD = "f";
    public static TableInfo EXTRACT_EXCEPTION_TABLE = new TableInfo("extract_exceptions", new ColInfo(Cols.CONTAINER_ID, 4), new ColInfo(Cols.FILE_PATH, 12, 1024), new ColInfo(Cols.EXTRACT_EXCEPTION_ID, 4), new ColInfo(Cols.PARSE_ERROR_ID, 4));
    public static TableInfo EXCEPTION_TABLE = new TableInfo("parse_exceptions", new ColInfo(Cols.ID, 4, "PRIMARY KEY"), new ColInfo(Cols.ORIG_STACK_TRACE, 12, 8192), new ColInfo(Cols.SORT_STACK_TRACE, 12, 8192), new ColInfo(Cols.PARSE_EXCEPTION_ID, 4));
    public static TableInfo CONTAINER_TABLE = new TableInfo("containers", new ColInfo(Cols.CONTAINER_ID, 4, "PRIMARY KEY"), new ColInfo(Cols.FILE_PATH, 12, 1024), new ColInfo(Cols.LENGTH, -5), new ColInfo(Cols.EXTRACT_FILE_LENGTH, -5));
    public static TableInfo PROFILE_TABLE = new TableInfo("profiles", new ColInfo(Cols.ID, 4, "PRIMARY KEY"), new ColInfo(Cols.CONTAINER_ID, 4), new ColInfo(Cols.FILE_NAME, 12, 256), new ColInfo(Cols.MD5, 1, 32), new ColInfo(Cols.LENGTH, -5), new ColInfo(Cols.IS_EMBEDDED, 16), new ColInfo(Cols.EMBEDDED_DEPTH, 4), new ColInfo(Cols.FILE_EXTENSION, 12, 12), new ColInfo(Cols.MIME_ID, 4), new ColInfo(Cols.ELAPSED_TIME_MILLIS, 4), new ColInfo(Cols.NUM_ATTACHMENTS, 4), new ColInfo(Cols.NUM_METADATA_VALUES, 4), new ColInfo(Cols.NUM_PAGES, 4), new ColInfo(Cols.HAS_CONTENT, 16));
    public static TableInfo EMBEDDED_FILE_PATH_TABLE = new TableInfo("emb_file_names", new ColInfo(Cols.ID, 4, "PRIMARY KEY"), new ColInfo(Cols.EMBEDDED_FILE_PATH, 12, 1024));
    public static TableInfo CONTENTS_TABLE = new TableInfo("contents", new ColInfo(Cols.ID, 4, "PRIMARY KEY"), new ColInfo(Cols.CONTENT_LENGTH, 4), new ColInfo(Cols.NUM_UNIQUE_TOKENS, 4), new ColInfo(Cols.NUM_TOKENS, 4), new ColInfo(Cols.COMMON_TOKENS_LANG, 12, 12), new ColInfo(Cols.NUM_UNIQUE_COMMON_TOKENS, 4), new ColInfo(Cols.NUM_COMMON_TOKENS, 4), new ColInfo(Cols.NUM_UNIQUE_ALPHABETIC_TOKENS, 4), new ColInfo(Cols.NUM_ALPHABETIC_TOKENS, 4), new ColInfo(Cols.TOP_N_TOKENS, 12, 1024), new ColInfo(Cols.LANG_ID_1, 12, 12), new ColInfo(Cols.LANG_ID_PROB_1, 6), new ColInfo(Cols.LANG_ID_2, 12, 12), new ColInfo(Cols.LANG_ID_PROB_2, 6), new ColInfo(Cols.UNICODE_CHAR_BLOCKS, 12, 1024), new ColInfo(Cols.TOKEN_ENTROPY_RATE, 6), new ColInfo(Cols.TOKEN_LENGTH_SUM, 4), new ColInfo(Cols.TOKEN_LENGTH_MEAN, 6), new ColInfo(Cols.TOKEN_LENGTH_STD_DEV, 6), new ColInfo(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, 16));
    public static TableInfo TAGS_TABLE = new TableInfo("tags", new ColInfo(Cols.ID, 4, "PRIMARY KEY"), new ColInfo(Cols.TAGS_A, 4), new ColInfo(Cols.TAGS_B, 4), new ColInfo(Cols.TAGS_DIV, 4), new ColInfo(Cols.TAGS_I, 4), new ColInfo(Cols.TAGS_IMG, 4), new ColInfo(Cols.TAGS_LI, 4), new ColInfo(Cols.TAGS_OL, 4), new ColInfo(Cols.TAGS_P, 4), new ColInfo(Cols.TAGS_TABLE, 4), new ColInfo(Cols.TAGS_TD, 4), new ColInfo(Cols.TAGS_TITLE, 4), new ColInfo(Cols.TAGS_TR, 4), new ColInfo(Cols.TAGS_U, 4), new ColInfo(Cols.TAGS_UL, 4), new ColInfo(Cols.TAGS_PARSE_EXCEPTION, 16));
    static Options OPTIONS;
    private final Path inputDir;
    private final Path extracts;
    private final ExtractReader extractReader;

    public ExtractProfiler(ArrayBlockingQueue<FileResource> queue, Path inputDir, Path extracts, ExtractReader extractReader, IDBWriter dbWriter) {
        super(queue, dbWriter);
        this.inputDir = inputDir;
        this.extracts = extracts;
        this.extractReader = extractReader;
    }

    public static void USAGE() {
        HelpFormatter helpFormatter = new HelpFormatter();
        helpFormatter.printHelp(80, "java -jar tika-eval-x.y.jar Profile -extracts extracts -db mydb [-inputDir input]", "Tool: Profile", OPTIONS, "Note: for the default h2 db, do not include the .mv.db at the end of the db name.");
    }

    @Override
    public boolean processFileResource(FileResource fileResource) {
        Metadata metadata = fileResource.getMetadata();
        EvalFilePaths fps = null;
        fps = this.inputDir != null && this.inputDir.equals(this.extracts) ? this.getPathsFromExtractCrawl(metadata, this.extracts) : this.getPathsFromSrcCrawl(metadata, this.inputDir, this.extracts);
        int containerId = ID.incrementAndGet();
        String containerIdString = Integer.toString(containerId);
        ExtractReaderException.TYPE extractExceptionType = null;
        List<Metadata> metadataList = null;
        try {
            metadataList = this.extractReader.loadExtract(fps.getExtractFile());
        }
        catch (ExtractReaderException e) {
            extractExceptionType = e.getType();
        }
        HashMap<Cols, String> contOutput = new HashMap<Cols, String>();
        long srcFileLen = this.getSourceFileLength(fps, metadataList);
        contOutput.put(Cols.LENGTH, srcFileLen > -1L ? Long.toString(srcFileLen) : "");
        contOutput.put(Cols.CONTAINER_ID, containerIdString);
        contOutput.put(Cols.FILE_PATH, fps.getRelativeSourceFilePath().toString());
        if (fps.getExtractFileLength() > 0L) {
            contOutput.put(Cols.EXTRACT_FILE_LENGTH, fps.getExtractFile() == null ? "" : Long.toString(fps.getExtractFileLength()));
        }
        try {
            this.writer.writeRow(CONTAINER_TABLE, contOutput);
        }
        catch (IOException e) {
            throw new RuntimeException(e);
        }
        if (extractExceptionType != null) {
            try {
                this.writeExtractException(EXTRACT_EXCEPTION_TABLE, containerIdString, fps.getRelativeSourceFilePath().toString(), extractExceptionType);
            }
            catch (IOException e) {
                throw new RuntimeException(e);
            }
            return true;
        }
        List<Integer> numAttachments = ExtractProfiler.countAttachments(metadataList);
        int i = 0;
        for (Metadata m : metadataList) {
            ContentTags contentTags = ExtractProfiler.getContent(fps, m);
            String fileId = i == 0 ? containerIdString : Integer.toString(ID.incrementAndGet());
            this.writeTagData(fileId, contentTags, TAGS_TABLE);
            this.writeProfileData(fps, i, contentTags, m, fileId, containerIdString, numAttachments, PROFILE_TABLE);
            this.writeEmbeddedPathData(i, fileId, m, EMBEDDED_FILE_PATH_TABLE);
            this.writeExceptionData(fileId, m, EXCEPTION_TABLE);
            try {
                Map<Class, Object> textStats = this.calcTextStats(contentTags);
                this.writeContentData(fileId, textStats, CONTENTS_TABLE);
            }
            catch (IOException e) {
                throw new RuntimeException(e);
            }
            ++i;
        }
        return true;
    }

    private void writeEmbeddedPathData(int i, String fileId, Metadata m, TableInfo embeddedFilePathTable) {
        if (i == 0) {
            return;
        }
        HashMap<Cols, String> data = new HashMap<Cols, String>();
        data.put(Cols.ID, fileId);
        data.put(Cols.EMBEDDED_FILE_PATH, m.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
        try {
            this.writer.writeRow(embeddedFilePathTable, data);
        }
        catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    static {
        Option extracts = new Option("extracts", true, "directory for extract files");
        extracts.setRequired(true);
        Option inputDir = new Option("inputDir", true, "optional: directory for original binary input documents. If not specified, -extracts is crawled as is.");
        OPTIONS = new Options().addOption(extracts).addOption(inputDir).addOption("bc", "optional: tika-batch config file").addOption("numConsumers", true, "optional: number of consumer threads").addOption(new Option("alterExtract", true, "for json-formatted extract files, process full metadata list ('as_is'=default), take just the first/container document ('first_only'), concatenate all content into the first metadata item ('concatenate_content')")).addOption("minExtractLength", true, "minimum extract length to process (in bytes)").addOption("maxExtractLength", true, "maximum extract length to process (in bytes)").addOption("db", true, "db file to which to write results").addOption("jdbc", true, "EXPERT: full jdbc connection string. Must specify this or -db <h2db>").addOption("jdbcDriver", true, "EXPERT: jdbc driver, or specify via -Djdbc.driver").addOption("tablePrefix", true, "EXPERT: optional prefix for table names").addOption("drop", false, "drop tables if they exist").addOption("maxFilesToAdd", true, "maximum number of files to add to the crawler").addOption("maxTokens", true, "maximum tokens to process, default=200000").addOption("maxContentLength", true, "truncate content beyond this length for calculating 'contents' stats, default=1000000").addOption("maxContentLengthForLangId", true, "truncate content beyond this length for language id, default=50000").addOption("defaultLangCode", true, "which language to use for common words if no 'common words' file exists for the langid result");
    }
}

