Block based text normalisation

pull/178/head
M66B 5 years ago
parent 7fecf52bb9
commit e0bcd1764a

@ -1776,52 +1776,87 @@ public class HtmlHelper {
int dp6 = Helper.dp2pixels(context, 6); int dp6 = Helper.dp2pixels(context, 6);
if (experiments) { if (experiments) {
// https://developer.android.com/guide/topics/text/spans // https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements
SpannableStringBuilder ssb = new SpannableStringBuilder();
NodeTraversor.traverse(new NodeVisitor() { NodeTraversor.traverse(new NodeVisitor() {
private int pre = 0;
private Element element;
private List<TextNode> block = new ArrayList<>();
private List<String> BLOCK_START = Collections.unmodifiableList(Arrays.asList(
"body", "blockquote", "h1", "h2", "h3", "h4", "h5", "h6", "li", "ol", "ul", "pre"
));
private List<String> BLOCK_END = Collections.unmodifiableList(Arrays.asList(
"body", "blockquote", "br", "h1", "h2", "h3", "h4", "h5", "h6", "li", "ol", "ul", "pre"
));
@Override @Override
public void head(Node node, int depth) { public void head(Node node, int depth) {
if (node instanceof Element) { if (node instanceof TextNode) {
Element element = (Element) node; if (pre == 0)
element.attr("start-index", Integer.toString(ssb.length())); block.add((TextNode) node);
} else if (node instanceof Element) {
element = (Element) node;
if (BLOCK_START.contains(element.tagName())) {
normalizeText(block);
block.clear();
}
if ("pre".equals(element.tagName()))
pre++;
}
}
boolean pre = false; @Override
Element parent = element.parent(); public void tail(Node node, int depth) {
while (parent != null) { if (node instanceof Element) {
if ("pre".equals(parent.tagName())) { element = (Element) node;
pre = true; if (BLOCK_END.contains(element.tagName())) {
break; normalizeText(block);
} block.clear();
parent = parent.parent();
} }
if ("pre".equals(element.tagName()))
pre--;
}
}
if (!pre) { private void normalizeText(List<TextNode> block) {
// https://developer.mozilla.org/en-US/docs/Web/API/Document_Object_Model/Whitespace // https://developer.mozilla.org/en-US/docs/Web/API/Document_Object_Model/Whitespace
List<TextNode> tnodes = getTextNodes(element); TextNode tnode;
for (int i = 0; i < tnodes.size(); i++) { String text;
TextNode tnode = tnodes.get(i); for (int i = 0; i < block.size(); i++) {
String text = tnode.getWholeText(); tnode = block.get(i);
if (TextUtils.isEmpty(text)) text = tnode.getWholeText();
continue; if (TextUtils.isEmpty(text))
continue;
// Remove whitespace before/after newlines // Remove whitespace before/after newlines
text = text.replaceAll("\\s+\\r?\\n\\s+", " "); text = text.replaceAll("\\s+\\r?\\n\\s+", " ");
if (i == 0 || (tnodes.get(i - 1).text().endsWith(" "))) if (i == 0 || (block.get(i - 1).text().endsWith(" ")))
while (text.startsWith(" ")) while (text.startsWith(" "))
text = text.substring(1); text = text.substring(1);
if (i == tnodes.size() - 1) if (i == block.size() - 1)
while (text.endsWith(" ")) while (text.endsWith(" "))
text = text.substring(0, text.length() - 1); text = text.substring(0, text.length() - 1);
tnode.text(text); tnode.text(text);
} }
} }
}, document.body());
// https://developer.android.com/guide/topics/text/spans
SpannableStringBuilder ssb = new SpannableStringBuilder();
NodeTraversor.traverse(new NodeVisitor() {
private Element element;
private TextNode tnode;
@Override
public void head(Node node, int depth) {
if (node instanceof Element) {
element = (Element) node;
element.attr("start-index", Integer.toString(ssb.length()));
} else if (node instanceof TextNode) { } else if (node instanceof TextNode) {
// https://developer.mozilla.org/en-US/docs/Web/API/Document_Object_Model/Whitespace tnode = (TextNode) node;
TextNode tnode = (TextNode) node;
ssb.append(tnode.text()); ssb.append(tnode.text());
} }
} }
@ -1829,7 +1864,7 @@ public class HtmlHelper {
@Override @Override
public void tail(Node node, int depth) { public void tail(Node node, int depth) {
if (node instanceof Element) { if (node instanceof Element) {
Element element = (Element) node; element = (Element) node;
int start = Integer.parseInt(element.attr("start-index")); int start = Integer.parseInt(element.attr("start-index"));
switch (element.tagName()) { switch (element.tagName()) {
case "a": case "a":
@ -1937,19 +1972,7 @@ public class HtmlHelper {
} }
} }
} }
}, document.body());
List<TextNode> getTextNodes(Element element) {
List<TextNode> result = new ArrayList<>();
for (Node child : element.childNodes())
if (child instanceof TextNode)
result.add((TextNode) child);
else if (child instanceof Element)
result.addAll(getTextNodes((Element) child));
return result;
}
}, document.body().children());
return reverseSpans(ssb); return reverseSpans(ssb);
} else } else

Loading…
Cancel
Save