package com.gu.util.text.cleaning;

import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.ccil.cowan.tagsoup.Parser;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.input.SAXHandler;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;
import org.xml.sax.InputSource;

/* loaded from: input_file:com/gu/util/text/cleaning/TagSoupHtmlCleaner.class */
/**
 * {@link TextCleaner} that runs markup through the TagSoup parser, builds a JDOM
 * document from the resulting SAX events, re-serialises the content of the root
 * element and returns whatever lies between the {@code <body>} tags.
 */
public class TagSoupHtmlCleaner implements TextCleaner {
    /**
     * Captures the text between the opening and closing {@code body} tags, without the
     * whitespace directly inside them. Case-insensitive, and {@code .} also matches line
     * terminators so the body may span several lines.
     */
    private static final Pattern BODY_PATTERN =
            Pattern.compile("<body.*?>\\s*(.*?)\\s*</body>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /**
     * Parses {@code str} with TagSoup and returns the serialised inner markup of the
     * resulting {@code body} element.
     *
     * @param str the markup to clean
     * @return the content found between the {@code <body>} tags of the re-serialised document
     * @throws RuntimeException wrapping any exception raised while parsing, serialising or
     *         locating the body element
     */
    @Override
    public String clean(String str) {
        Parser parser = new Parser();
        SAXHandler documentBuilder = new SAXHandler();
        parser.setContentHandler(documentBuilder);
        try {
            parser.parse(inputSourceFor(str));
            return getInternalTextFromBodyElement(documentBuilder.getDocument());
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Serialises the content of the document's root element (raw format, empty elements
     * written as explicit start/end tag pairs) and extracts the text inside the
     * {@code body} element.
     *
     * @param document the document built from the parser's SAX events
     * @return the text captured between the {@code <body>} tags
     * @throws IOException if writing the serialised content fails
     * @throws IllegalStateException if the serialised content contains no {@code body} element
     */
    private String getInternalTextFromBodyElement(Document document) throws IOException {
        Element rootElement = document.getRootElement();
        XMLOutputter outputter = new XMLOutputter(Format.getRawFormat().setExpandEmptyElements(true));
        StringWriter serialised = new StringWriter();
        outputter.outputElementContent(rootElement, serialised);

        Matcher matcher = BODY_PATTERN.matcher(serialised.toString());
        // Usual case: the body element is the entire content of the root element.
        if (matcher.matches()) {
            return matcher.group(1);
        }
        // The root content holds something besides the body element (presumably a head
        // element when the input carries head-level markup), so search for the body
        // instead of requiring the whole string to match.
        matcher.reset();
        if (matcher.find()) {
            return matcher.group(1);
        }
        throw new IllegalStateException("No <body> element found in the parsed markup");
    }

    /** Wraps the given string in a SAX {@link InputSource} backed by a {@link StringReader}. */
    private InputSource inputSourceFor(String str) {
        return new InputSource(new StringReader(str));
    }
}
