package de.l3s.boilerpipe.sax;

import de.l3s.boilerpipe.document.TextBlock;
import de.l3s.boilerpipe.document.TextDocument;
import de.l3s.boilerpipe.labels.LabelAction;
import de.l3s.boilerpipe.util.UnicodeTokenizer;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;

/* JADX WARN: Classes with same name are omitted:
  input_file:WEB-INF/lib/boilerpipe-1.1.0.jar:de/l3s/boilerpipe/sax/BoilerpipeHTMLContentHandler.class
 */
/* loaded from: input_file:WEB-INF/lib/tika-app-1.3.jar:de/l3s/boilerpipe/sax/BoilerpipeHTMLContentHandler.class */
public class BoilerpipeHTMLContentHandler implements ContentHandler {
    /** Tag actions looked up by element local name in {@code startElement}/{@code endElement}. */
    private final Map<String, TagAction> tagActions;
    /** Document title; only ever replaced with a non-empty value (see {@code setTitle}). */
    private String title;
    /** Marker token that switches {@code inAnchorText} on while tokens are counted in {@code flushBlock}. */
    static final String ANCHOR_TEXT_START = "$\ue00a<";
    /** Marker token that switches {@code inAnchorText} off while tokens are counted in {@code flushBlock}. */
    static final String ANCHOR_TEXT_END = ">\ue00a$";
    /** Text of the current block as it is tokenized for word/line statistics in {@code flushBlock}. */
    StringBuilder tokenBuffer;
    /** Text of the current block as it is stored (trimmed) in the resulting {@code TextBlock}. */
    StringBuilder textBuffer;
    // The three counters below are package-visible and are not modified in this class apart from
    // being reset; presumably the TagAction implementations maintain them -- TODO confirm.
    /** {@code flushBlock} treats 0 as "outside the body": text is discarded (or used as title). */
    int inBody;
    /** Not read in this class. */
    int inAnchor;
    /** While non-zero, {@code characters} discards incoming text. */
    int inIgnorableElement;
    /** True if the last character appended to the buffers was a space; prevents runs of spaces. */
    boolean sbLastWasWhitespace;
    /** Incremented on every {@code characters} call; identifies the current text element. */
    private int textElementIdx;
    /** Blocks emitted so far, in document order. */
    private final List<TextBlock> textBlocks;
    /** Local name passed to the most recent {@code startElement} call. */
    private String lastStartTag;
    /** Local name passed to the most recent {@code endElement} call. */
    private String lastEndTag;
    /** Kind of the most recent callback recorded by this handler. */
    private Event lastEvent;
    /** Running index handed to each new {@code TextBlock}; incremented once per emitted block. */
    private int offsetBlocks;
    /** Indexes ({@code textElementIdx}) of the text elements that contributed to the current block. */
    private BitSet currentContainedTextElements;
    /** Set by element callbacks; makes the next {@code characters} call flush the pending block first. */
    private boolean flush;
    /** Whether tokens currently being counted are anchor text; persists across {@code flushBlock} calls. */
    boolean inAnchorText;
    /** Label actions applied (non-null entries only) to every block in {@code addTextBlock}. */
    LinkedList<LabelAction> labelStack;
    /** Font sizes; the first non-null entry in iteration order yields a {@code "font-N"} label. */
    LinkedList<Integer> fontSizeStack;
    /** A token counts as a word if it contains at least one Unicode letter or number character. */
    private static final Pattern PAT_VALID_WORD_CHARACTER = Pattern.compile("[\\p{L}\\p{Nd}\\p{Nl}\\p{No}]");

    /* JADX WARN: Classes with same name are omitted:
      input_file:WEB-INF/lib/boilerpipe-1.1.0.jar:de/l3s/boilerpipe/sax/BoilerpipeHTMLContentHandler$Event.class
     */
    /* loaded from: input_file:WEB-INF/lib/tika-app-1.3.jar:de/l3s/boilerpipe/sax/BoilerpipeHTMLContentHandler$Event.class */
    /** Kind of the most recent SAX callback, as recorded in {@code lastEvent}. */
    private enum Event {
        /** Recorded by {@code startElement}. */
        START_TAG,
        /** Recorded by {@code endElement}. */
        END_TAG,
        /** Recorded by {@code characters} when the chunk contained non-whitespace text. */
        CHARACTERS,
        /** Recorded by {@code characters} when the chunk consisted only of whitespace. */
        WHITESPACE
    }

    /**
     * Resets this handler to the state it had right after construction so that it can be reused
     * for another document.
     *
     * <p>Besides the parsing counters and buffers this also drops the previously detected title
     * and empties the label and font-size stacks; otherwise a reused handler could report the
     * title of an earlier document or attach stale labels to new blocks.
     */
    public void recycle() {
        this.title = null;
        this.tokenBuffer.setLength(0);
        this.textBuffer.setLength(0);
        this.inBody = 0;
        this.inAnchor = 0;
        this.inIgnorableElement = 0;
        this.sbLastWasWhitespace = false;
        this.textElementIdx = 0;
        this.textBlocks.clear();
        this.lastStartTag = null;
        this.lastEndTag = null;
        this.lastEvent = null;
        this.offsetBlocks = 0;
        this.currentContainedTextElements.clear();
        this.flush = false;
        this.inAnchorText = false;
        this.labelStack.clear();
        this.fontSizeStack.clear();
    }

    /** Creates a handler that uses {@code DefaultTagActionMap.INSTANCE} as its tag-action map. */
    public BoilerpipeHTMLContentHandler() {
        this(DefaultTagActionMap.INSTANCE);
    }

    /**
     * Creates a handler with empty buffers and stacks that dispatches element events to the
     * given tag actions.
     *
     * @param tagActionMap the actions to run for elements, keyed by element name; elements
     *                     without an entry simply force a block flush
     */
    public BoilerpipeHTMLContentHandler(TagActionMap tagActionMap) {
        this.title = null;
        this.tokenBuffer = new StringBuilder();
        this.textBuffer = new StringBuilder();
        this.inBody = 0;
        this.inAnchor = 0;
        this.inIgnorableElement = 0;
        this.sbLastWasWhitespace = false;
        this.textElementIdx = 0;
        // Diamond instead of the raw type, so the assignment is checked by the compiler.
        this.textBlocks = new ArrayList<>();
        this.lastStartTag = null;
        this.lastEndTag = null;
        this.lastEvent = null;
        this.offsetBlocks = 0;
        this.currentContainedTextElements = new BitSet();
        this.flush = false;
        this.inAnchorText = false;
        this.labelStack = new LinkedList<>();
        this.fontSizeStack = new LinkedList<>();
        this.tagActions = tagActionMap;
    }

    /**
     * Emits whatever text is still buffered as a final block.
     *
     * @throws SAXException declared by the interface; not thrown by the visible code
     */
    @Override // org.xml.sax.ContentHandler
    public void endDocument() throws SAXException {
        flushBlock();
    }

    /** Intentionally does nothing; this handler ignores namespace prefix mappings. */
    @Override // org.xml.sax.ContentHandler
    public void endPrefixMapping(String str) throws SAXException {
    }

    /**
     * Collapses any run of ignorable whitespace into a single space in both buffers.
     * The actual characters are never inspected; a space is appended only if the buffers do
     * not already end with one.
     */
    @Override // org.xml.sax.ContentHandler
    public void ignorableWhitespace(char[] cArr, int i, int i2) throws SAXException {
        if (this.sbLastWasWhitespace) {
            // A separating space is already in place; nothing to add.
            return;
        }
        this.textBuffer.append(' ');
        this.tokenBuffer.append(' ');
        this.sbLastWasWhitespace = true;
    }

    /** Intentionally does nothing; processing instructions are ignored by this handler. */
    @Override // org.xml.sax.ContentHandler
    public void processingInstruction(String str, String str2) throws SAXException {
    }

    /** Intentionally does nothing; the locator is not stored or used by this handler. */
    @Override // org.xml.sax.ContentHandler
    public void setDocumentLocator(Locator locator) {
    }

    /** Intentionally does nothing; skipped entities are ignored by this handler. */
    @Override // org.xml.sax.ContentHandler
    public void skippedEntity(String str) throws SAXException {
    }

    /**
     * Intentionally does nothing. Note that handler state is NOT reset here; call
     * {@code recycle()} explicitly before reusing an instance for another document.
     */
    @Override // org.xml.sax.ContentHandler
    public void startDocument() throws SAXException {
    }

    /** Intentionally does nothing; this handler ignores namespace prefix mappings. */
    @Override // org.xml.sax.ContentHandler
    public void startPrefixMapping(String str, String str2) throws SAXException {
    }

    /**
     * Dispatches an element start to the tag action registered for its local name.
     *
     * <p>A pending flush is requested when the action asks for one, or when no action is
     * registered for the element at all. An already pending flush is never cancelled here.
     *
     * @param str        namespace URI (unused)
     * @param str2       local name; the lookup key for the tag action
     * @param str3       qualified name, forwarded to the action
     * @param attributes element attributes, forwarded to the action
     * @throws SAXException if the tag action throws it
     */
    @Override // org.xml.sax.ContentHandler
    public void startElement(String str, String str2, String str3, Attributes attributes) throws SAXException {
        final TagAction action = this.tagActions.get(str2);
        if (action == null) {
            // Unknown elements always separate blocks.
            this.flush = true;
        } else if (action.start(this, str2, str3, attributes)) {
            this.flush = true;
        }
        this.lastEvent = Event.START_TAG;
        this.lastStartTag = str2;
    }

    /**
     * Dispatches an element end to the tag action registered for its local name.
     *
     * <p>A pending flush is requested when the action asks for one, or when no action is
     * registered for the element at all. An already pending flush is never cancelled here.
     *
     * @param str  namespace URI (unused)
     * @param str2 local name; the lookup key for the tag action
     * @param str3 qualified name, forwarded to the action
     * @throws SAXException if the tag action throws it
     */
    @Override // org.xml.sax.ContentHandler
    public void endElement(String str, String str2, String str3) throws SAXException {
        final TagAction action = this.tagActions.get(str2);
        if (action == null) {
            // Unknown elements always separate blocks.
            this.flush = true;
        } else if (action.end(this, str2, str3)) {
            this.flush = true;
        }
        this.lastEvent = Event.END_TAG;
        this.lastEndTag = str2;
    }

    /**
     * Appends a chunk of character data to the current block, collapsing leading and trailing
     * whitespace of the chunk into at most one space each.
     *
     * <p>Every call counts as one text element, even if the text is later discarded. If an
     * element callback requested a flush, the pending block is emitted first. Text is dropped
     * while {@code inIgnorableElement} is non-zero.
     *
     * <p>Note: whitespace characters inside the given range of {@code cArr} are overwritten
     * with {@code ' '} in place, so the caller's array is modified.
     *
     * @param cArr the character array delivered by the parser
     * @param i    index of the first character of the chunk
     * @param i2   number of characters in the chunk
     * @throws SAXException declared by the interface; not thrown by the visible code
     */
    @Override // org.xml.sax.ContentHandler
    public void characters(char[] cArr, int i, int i2) throws SAXException {
        this.textElementIdx++;
        if (this.flush) {
            flushBlock();
            this.flush = false;
        }
        if (this.inIgnorableElement != 0 || i2 == 0) {
            return;
        }

        // Normalize every kind of whitespace (tabs, line breaks, ...) to a plain space.
        final int end = i + i2;
        for (int k = i; k < end; k++) {
            if (Character.isWhitespace(cArr[k])) {
                cArr[k] = ' ';
            }
        }

        // Skip leading spaces.
        int start = i;
        while (start < end && cArr[start] == ' ') {
            start++;
        }
        final boolean leadingSpace = start > i;
        int length = i2 - (start - i);

        if (length == 0) {
            // The chunk consisted of whitespace only (a non-empty chunk can only shrink to
            // zero by skipping leading spaces), so it contributes at most one separator.
            if (!this.sbLastWasWhitespace) {
                this.textBuffer.append(' ');
                this.tokenBuffer.append(' ');
            }
            this.sbLastWasWhitespace = true;
            this.lastEvent = Event.WHITESPACE;
            return;
        }

        // Strip trailing spaces; cArr[start] is not a space, so at least one char remains.
        final int untrimmedLength = length;
        while (length > 0 && cArr[start + length - 1] == ' ') {
            length--;
        }
        final boolean trailingSpace = length < untrimmedLength;

        if (leadingSpace && !this.sbLastWasWhitespace) {
            this.textBuffer.append(' ');
            this.tokenBuffer.append(' ');
        }
        this.textBuffer.append(cArr, start, length);
        this.tokenBuffer.append(cArr, start, length);
        if (trailingSpace) {
            this.textBuffer.append(' ');
            this.tokenBuffer.append(' ');
        }
        this.sbLastWasWhitespace = trailingSpace;
        this.lastEvent = Event.CHARACTERS;
        // Remember that this text element contributed to the block being built.
        this.currentContainedTextElements.set(this.textElementIdx);
    }

    /**
     * Returns the blocks emitted so far.
     *
     * @return the handler's internal list itself, not a copy
     */
    List<TextBlock> getTextBlocks() {
        return this.textBlocks;
    }

    /**
     * Turns the buffered text into a {@code TextBlock} and resets the buffers.
     *
     * <p>Outside the body no block is produced: the buffers are discarded, except that the
     * buffered text becomes the title when the last started element was {@code TITLE}.
     * Inside the body the token buffer is tokenized to compute the number of words, the
     * number of words inside anchor text, and line-wrapping statistics for a fixed line
     * width; the block is then handed to {@code addTextBlock}.
     *
     * <p>If the buffer yields no countable token, the method returns without emitting a
     * block and without clearing the buffers.
     */
    /* JADX INFO: Access modifiers changed from: package-private */
    public void flushBlock() {
        if (this.inBody == 0) {
            if ("TITLE".equalsIgnoreCase(this.lastStartTag)) {
                setTitle(this.tokenBuffer.toString().trim());
            }
            this.textBuffer.setLength(0);
            this.tokenBuffer.setLength(0);
            return;
        }

        final int bufferedLength = this.tokenBuffer.length();
        if (bufferedLength == 0) {
            return;
        }
        if (bufferedLength == 1 && this.sbLastWasWhitespace) {
            // Nothing but a single separator space was buffered.
            this.textBuffer.setLength(0);
            this.tokenBuffer.setLength(0);
            return;
        }

        final int maxLineLength = 80;
        int numTokens = 0;
        int numWords = 0;
        int numLinkedWords = 0;
        int numWrappedLines = 0;
        int numWordsCurrentLine = 0;
        int currentLineLength = -1; // compensates for the "+ 1" added with the first word

        for (String token : UnicodeTokenizer.tokenize(this.tokenBuffer)) {
            if (ANCHOR_TEXT_START.equals(token)) {
                this.inAnchorText = true;
                continue;
            }
            if (ANCHOR_TEXT_END.equals(token)) {
                this.inAnchorText = false;
                continue;
            }
            numTokens++;
            if (!isWord(token)) {
                continue;
            }
            numWords++;
            numWordsCurrentLine++;
            if (this.inAnchorText) {
                numLinkedWords++;
            }
            final int tokenLength = token.length();
            currentLineLength += tokenLength + 1;
            if (currentLineLength > maxLineLength) {
                // The word does not fit any more: it starts a new (virtual) line.
                numWrappedLines++;
                currentLineLength = tokenLength;
                numWordsCurrentLine = 1;
            }
        }

        if (numTokens == 0) {
            return;
        }

        // Words on the last, unfinished line are excluded unless nothing wrapped at all.
        final int numWordsInWrappedLines =
                numWrappedLines == 0 ? numWords : numWords - numWordsCurrentLine;
        if (numWrappedLines == 0) {
            numWrappedLines = 1;
        }

        final String blockText = this.textBuffer.toString().trim();
        final TextBlock block = new TextBlock(blockText, this.currentContainedTextElements,
                numWords, numLinkedWords, numWordsInWrappedLines, numWrappedLines, this.offsetBlocks);

        // The block keeps the old BitSet, so start a fresh one for the next block.
        this.currentContainedTextElements = new BitSet();
        this.offsetBlocks++;
        this.textBuffer.setLength(0);
        this.tokenBuffer.setLength(0);
        addTextBlock(block);
    }

    /**
     * Labels a freshly created block and appends it to the list of blocks.
     *
     * <p>The first non-null entry of the font-size stack (in iteration order) yields a
     * {@code "font-<size>"} label; every non-null label action on the label stack is applied.
     *
     * @param textBlock the block to label and store
     */
    protected void addTextBlock(TextBlock textBlock) {
        for (Integer fontSize : this.fontSizeStack) {
            if (fontSize != null) {
                textBlock.addLabel("font-" + fontSize);
                break; // only the first known font size counts
            }
        }
        for (LabelAction labelAction : this.labelStack) {
            if (labelAction != null) {
                labelAction.addTo(textBlock);
            }
        }
        this.textBlocks.add(textBlock);
    }

    /**
     * Tells whether a token counts as a word.
     *
     * @param str the token to test
     * @return {@code true} if the token contains at least one character matched by
     *         {@code PAT_VALID_WORD_CHARACTER}; a single match anywhere in the token suffices
     */
    private static boolean isWord(String str) {
        return PAT_VALID_WORD_CHARACTER.matcher(str).find();
    }

    /**
     * Returns the document title.
     *
     * @return the title, or {@code null} if none has been set
     */
    public String getTitle() {
        return this.title;
    }

    /**
     * Sets the document title. {@code null} and empty strings are ignored, so an existing
     * title is never erased through this method.
     *
     * @param str the new title
     */
    public void setTitle(String str) {
        if (str != null && !str.isEmpty()) {
            this.title = str;
        }
    }

    /**
     * Flushes any pending text and wraps the title and the blocks collected so far in a
     * {@code TextDocument}.
     *
     * <p>NOTE(review): the handler's own block list is passed on, and {@code recycle()} clears
     * that same list. Confirm whether {@code TextDocument} copies it before recycling a
     * handler whose document is still in use.
     *
     * @return a document built from the current title and blocks
     */
    public TextDocument toTextDocument() {
        flushBlock();
        return new TextDocument(getTitle(), getTextBlocks());
    }

    /**
     * Appends a single separating space to both buffers unless they already end with one.
     */
    public void addWhitespaceIfNecessary() {
        if (!this.sbLastWasWhitespace) {
            this.tokenBuffer.append(' ');
            this.textBuffer.append(' ');
            this.sbLastWasWhitespace = true;
        }
    }
}
