package net.fred.feedex.utils;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.regex.Pattern;
import net.fred.feedex.Constants;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

/* loaded from: classes.dex */
/**
 * Heuristically selects the element of an HTML document that most likely holds the main
 * article content.
 *
 * <p>Every candidate element (see {@link #getNodes(Document)}) receives an integer weight
 * derived from its {@code class}, {@code id} and {@code style} attributes, the length of its
 * own text, and the text of its direct children. The candidate with the highest positive
 * weight wins. The three attribute patterns can be replaced or extended through the fluent
 * {@code set*} / {@code add*} methods.
 */
public class ArticleTextExtractor {
    /** Tag names that are considered as candidates for the article container. */
    private static final Pattern NODES = Pattern.compile("p|div|td|h1|h2|article|section");
    /** Inline style fragments that count against an element. */
    private static final Pattern NEGATIVE_STYLE = Pattern.compile("hidden|display: ?none|font-size: ?small");
    /** Heading tag names h1 through h6; used with {@code matches()}, so the whole name must match. */
    private static final Pattern HEADINGS = Pattern.compile("h[1-6]");

    private Pattern negativePattern;
    private Pattern positivePattern;
    private Pattern unlikelyPattern;
    private String negativeStr;
    private String positiveStr;
    private String unlikelyStr;

    /** Creates an extractor configured with the default unlikely/positive/negative patterns. */
    public ArticleTextExtractor() {
        setUnlikely("com(bx|ment|munity)|dis(qus|cuss)|e(xtra|[-]?mail)|foot|header|menu|re(mark|ply)|rss|sh(are|outbox)|sponsora(d|ll|gegate|rchive|ttachment)|(pag(er|ination))|popup|print|login|si(debar|gn|ngle)");
        setPositive("(^(body|content|h?entry|main|page|post|text|blog|story|haupt))|arti(cle|kel)|instapaper_body");
        setNegative("nav($|igation)|user|com(ment|bx)|(^com-)|contact|foot|masthead|(me(dia|ta))|outbrain|promo|related|scroll|(sho(utbox|pping))|sidebar|sponsor|tags|tool|widget|player|disclaimer|toc|infobox|vcard");
    }

    /**
     * Scores an element from its {@code class}, {@code id} and {@code style} attributes.
     *
     * @param element the element to score
     * @return the sum of: +35 / +40 for a positive class / id match, -20 each for an unlikely
     *         class / id match, -50 each for a negative class / id match, and -50 when the
     *         inline style matches {@link #NEGATIVE_STYLE}
     */
    private int calcWeight(Element element) {
        String className = element.className();
        String id = element.id();

        int weight = 0;
        if (this.positivePattern.matcher(className).find()) {
            weight += 35;
        }
        if (this.positivePattern.matcher(id).find()) {
            weight += 40;
        }
        if (this.unlikelyPattern.matcher(className).find()) {
            weight -= 20;
        }
        if (this.unlikelyPattern.matcher(id).find()) {
            weight -= 20;
        }
        if (this.negativePattern.matcher(className).find()) {
            weight -= 50;
        }
        if (this.negativePattern.matcher(id).find()) {
            weight -= 50;
        }

        String style = element.attr("style");
        if (style != null && !style.isEmpty() && NEGATIVE_STYLE.matcher(style).find()) {
            weight -= 50;
        }
        return weight;
    }

    /**
     * Scores the own text of a child {@code div} or {@code p}.
     *
     * @param text the child's own text
     * @return -30 when the text holds more than five occurrences in total of
     *         {@code Constants.HTML_QUOT}, {@code Constants.HTML_LT},
     *         {@code Constants.HTML_GT} and {@code "px"} (presumably leftover markup or CSS;
     *         the constants are defined outside this file); otherwise the text length
     *         divided by 25, rounded
     */
    private int calcWeightForChild(String text) {
        int markupHits = count(text, Constants.HTML_QUOT)
                + count(text, Constants.HTML_LT)
                + count(text, Constants.HTML_GT)
                + count(text, "px");
        if (markupHits > 5) {
            return -30;
        }
        return (int) Math.round(text.length() / 25.0d);
    }

    /**
     * Counts the non-overlapping occurrences of {@code str2} in {@code str}, scanning left to
     * right.
     *
     * <p>Implemented iteratively so that long inputs with many matches neither overflow the
     * stack nor copy the remaining text for every match.
     *
     * @param str the text to search
     * @param str2 the substring to look for
     * @return the number of occurrences; 0 when {@code str2} is empty
     * @throws NullPointerException if either argument is {@code null}
     */
    public static int count(String str, String str2) {
        if (str2.isEmpty()) {
            // An empty needle matches at every position and would never advance the scan.
            return 0;
        }
        int occurrences = 0;
        int from = str.indexOf(str2);
        while (from >= 0) {
            occurrences++;
            from = str.indexOf(str2, from + str2.length());
        }
        return occurrences;
    }

    /**
     * Removes every {@code script}, {@code noscript} and {@code style} element from the
     * document, in place.
     *
     * @param document the document to clean
     * @return the same document instance
     */
    private Document removeScriptsAndStyles(Document document) {
        String[] tags = {"script", "noscript", "style"};
        for (String tag : tags) {
            for (Element match : document.getElementsByTag(tag)) {
                match.remove();
            }
        }
        return document;
    }

    /**
     * Appends an alternative to the current negative pattern.
     *
     * @param str regular-expression fragment to add as a further alternative
     * @return this extractor
     */
    public ArticleTextExtractor addNegative(String str) {
        return setNegative(this.negativeStr + "|" + str);
    }

    /**
     * Appends an alternative to the current positive pattern.
     *
     * @param str regular-expression fragment to add as a further alternative
     * @return this extractor
     */
    public ArticleTextExtractor addPositive(String str) {
        return setPositive(this.positiveStr + "|" + str);
    }

    /**
     * Appends an alternative to the current unlikely pattern.
     *
     * @param str regular-expression fragment to add as a further alternative
     * @return this extractor
     */
    public ArticleTextExtractor addUnlikely(String str) {
        return setUnlikely(this.unlikelyStr + "|" + str);
    }

    /**
     * Parses the given HTML and extracts the best-scoring element from it.
     *
     * @param str the HTML source
     * @return the string form of the best-scoring element, or {@code null} when no candidate
     *         has a positive weight
     * @throws IllegalArgumentException if {@code str} is empty
     * @throws NullPointerException if {@code str} is {@code null}
     */
    public String extractContent(String str) throws Exception {
        if (str.isEmpty()) {
            throw new IllegalArgumentException("html string is empty!?");
        }
        return extractContent(Jsoup.parse(str));
    }

    /**
     * Extracts the best-scoring element from the document.
     *
     * <p>The document is modified: scripts and styles are stripped by
     * {@link #prepareDocument(Document)} before scoring.
     *
     * @param document the parsed document
     * @return the string form of the best-scoring element, or {@code null} when no candidate
     *         has a positive weight
     * @throws NullPointerException if {@code document} is {@code null}
     */
    public String extractContent(Document document) throws Exception {
        if (document == null) {
            throw new NullPointerException("missing document");
        }
        prepareDocument(document);

        int bestWeight = 0;
        Element best = null;
        for (Element candidate : getNodes(document)) {
            int weight = getWeight(candidate);
            if (weight > bestWeight) {
                bestWeight = weight;
                best = candidate;
                if (bestWeight > 200) {
                    // Good enough: stop at the first candidate that clears the threshold.
                    break;
                }
            }
        }
        return best != null ? best.toString() : null;
    }

    /**
     * Collects the candidate elements below {@code body} whose tag name matches
     * {@link #NODES}.
     *
     * <p>The result keeps the order in which the elements were selected. Because
     * {@link #extractContent(Document)} keeps the first element on ties and stops early above
     * a threshold, a stable order is needed for the outcome to be deterministic.
     *
     * @param document the document to scan
     * @return the candidate elements, in selection order
     */
    public Collection<Element> getNodes(Document document) {
        Collection<Element> candidates = new LinkedHashSet<Element>(64);
        for (Element element : document.select("body").select("*")) {
            if (NODES.matcher(element.tagName()).matches()) {
                candidates.add(element);
            }
        }
        return candidates;
    }

    /**
     * Computes the total weight of a candidate element.
     *
     * @param element the candidate
     * @return the attribute weight, plus one tenth of the own-text length (rounded), plus the
     *         weight contributed by the direct children
     */
    protected int getWeight(Element element) {
        int ownTextWeight = (int) Math.round((element.ownText().length() / 100.0d) * 10.0d);
        return calcWeight(element) + ownTextWeight + weightChildNodes(element);
    }

    /**
     * Prepares the document for scoring by removing script, noscript and style elements.
     *
     * @param document the document to prepare; modified in place
     */
    protected void prepareDocument(Document document) {
        removeScriptsAndStyles(document);
    }

    /**
     * Replaces the negative pattern.
     *
     * @param str the regular expression to use
     * @return this extractor
     */
    public ArticleTextExtractor setNegative(String str) {
        this.negativeStr = str;
        this.negativePattern = Pattern.compile(str);
        return this;
    }

    /**
     * Replaces the positive pattern.
     *
     * @param str the regular expression to use
     * @return this extractor
     */
    public ArticleTextExtractor setPositive(String str) {
        this.positiveStr = str;
        this.positivePattern = Pattern.compile(str);
        return this;
    }

    /**
     * Replaces the unlikely pattern.
     *
     * @param str the regular expression to use
     * @return this extractor
     */
    public ArticleTextExtractor setUnlikely(String str) {
        this.unlikelyStr = str;
        this.unlikelyPattern = Pattern.compile(str);
        return this;
    }

    /**
     * Computes the weight contributed by the direct children of an element.
     *
     * <p>Only children with at least 20 characters of own text contribute individually:
     * more than 200 characters adds {@code max(50, length / 10)}; an {@code h1}/{@code h2}
     * adds 30; a {@code div}/{@code p} adds its text score. A {@code div}/{@code p} child
     * whose class is "caption" (compared case-insensitively) adds a one-off 30. When at
     * least two {@code p} children have more than 50 characters of own text, every heading
     * child ({@code h1} to {@code h6}), regardless of its text length, adds 20.
     *
     * @param element the parent element
     * @return the accumulated child weight
     */
    protected int weightChildNodes(Element element) {
        int weight = 0;
        int longParagraphs = 0;
        int headings = 0;
        boolean hasCaption = false;

        for (Element child : element.children()) {
            String tag = child.tagName();
            if (HEADINGS.matcher(tag).matches()) {
                headings++;
            }

            String ownText = child.ownText();
            int length = ownText.length();
            if (length < 20) {
                continue;
            }
            if (length > 200) {
                weight += Math.max(50, length / 10);
            }
            if (tag.equals("h1") || tag.equals("h2")) {
                weight += 30;
            } else if (tag.equals("div") || tag.equals("p")) {
                weight += calcWeightForChild(ownText);
                if (tag.equals("p") && length > 50) {
                    longParagraphs++;
                }
                // equalsIgnoreCase avoids the default-locale dependence of toLowerCase().
                if ("caption".equalsIgnoreCase(child.className())) {
                    hasCaption = true;
                }
            }
        }

        if (hasCaption) {
            weight += 30;
        }
        if (longParagraphs >= 2) {
            weight += 20 * headings;
        }
        return weight;
    }
}
