package com.gu.util.text.cleaning;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.apache.xml.serialize.OutputFormat;
import org.apache.xml.serialize.XMLSerializer;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.tidy.Tidy;

/* loaded from: input_file:com/gu/util/text/cleaning/JTidyCleaner.class */
/**
 * {@link TextCleaner} that tidies an HTML fragment by round-tripping it through JTidy.
 *
 * <p>The fragment is wrapped in a {@code <body>} element, parsed by JTidy into a DOM,
 * the resulting {@code body} element is serialized back to text, and the markup found
 * between the {@code <body>} tags (with surrounding whitespace removed) is returned.
 */
public class JTidyCleaner implements TextCleaner {
    /**
     * Captures everything between the serialized {@code <body>} tags, excluding leading and
     * trailing whitespace. Case-insensitive; DOTALL so the content may span several lines.
     */
    private static final Pattern BODY_PATTERN =
            Pattern.compile("<body>\\s*(.*?)\\s*</body>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /**
     * Cleans the given HTML fragment.
     *
     * @param str the fragment to clean; may be {@code null}
     * @return {@code str} itself when it is blank (null, empty or whitespace only); otherwise
     *         the tidied markup, or an empty string when the tidied document yields no
     *         {@code <body>...</body>} content to extract
     * @throws RuntimeException if the UTF-8 encoding is unavailable or serialization fails
     */
    @Override
    public String clean(String str) {
        if (StringUtils.isBlank(str)) {
            return str;
        }

        Tidy tidy = new Tidy();
        // NOTE(review): 3 is presumably JTidy's Configuration.UTF8, matching the UTF-8 bytes
        // fed to the parser below - confirm against the JTidy version in use.
        tidy.setCharEncoding(3);
        tidy.setXHTML(true);
        tidy.setQuiet(true);

        byte[] wrappedBytes;
        try {
            wrappedBytes = ("<body>" + str + "</body>").getBytes("UTF-8");
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException(e);
        }

        // The explicit cast keeps the (InputStream, OutputStream) overload selected for the
        // null second argument.
        Document document = tidy.parseDOM(new ByteArrayInputStream(wrappedBytes), (OutputStream) null);

        Matcher matcher = BODY_PATTERN.matcher(getBodyStringFromDocument(document));
        // group(1) throws IllegalStateException unless find() succeeded, so a serialized body
        // that does not have the <body>...</body> shape is treated as "nothing left".
        return matcher.find() ? matcher.group(1) : "";
    }

    /**
     * Serializes the first {@code body} element of the document, without an XML declaration.
     *
     * @param document the DOM produced by JTidy
     * @return the serialized {@code body} element, or an empty string when the document has
     *         no root element or no {@code body} element
     * @throws RuntimeException wrapping any {@link IOException} raised by the serializer
     */
    private String getBodyStringFromDocument(Document document) {
        if (document == null || document.getDocumentElement() == null) {
            return "";
        }
        Element body = (Element) document.getDocumentElement().getElementsByTagName("body").item(0);
        if (body == null) {
            // item(0) yields null when no body element exists; nothing to serialize.
            return "";
        }

        StringWriter buffer = new StringWriter();
        OutputFormat format = new OutputFormat();
        format.setOmitXMLDeclaration(true);
        try {
            new XMLSerializer(buffer, format).asDOMSerializer().serialize(body);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return buffer.toString();
    }
}
