# -*- coding: utf-8 -*-
"""Helpers for text that mixes plain strings with an embedded XML fragment.

Note: XML has five special characters: & < > " '

There are two ways to put them into XML text:

1. Wrap the string that contains them in a special section (presumably
   ``<![CDATA[ ... ]]>``; the original example is missing from this note).
2. Replace each character with its XML escape sequence:

       &   ->  &amp;
       <   ->  &lt;
       >   ->  &gt;
       "   ->  &quot;
       '   ->  &apos;

   For example, a quoted name is written as ``&quot;name&quot;``.
"""
import re
import xml.dom.minidom
import xml.parsers.expat
from xml.dom.minidom import Node
from xml.dom.minidom import parseString

# NOTE(review): both patterns below look as if literal opening/closing tag
# text was lost from them. As written, ``.*?`` in _XML_CONTENT_PATTERN always
# matches the empty string, and the middle group of _SPLIT_PATTERN is always
# empty. Confirm the intended tag name before relying on these.
_XML_CONTENT_PATTERN = re.compile(r".*?", re.M | re.S)
_SPLIT_PATTERN = re.compile(r'(.*\s*?)(.*?)(.*\s*)$', re.M | re.S)
_WHITESPACE_PATTERN = re.compile(r"\s+")


def _strip_whitespace(text):
    """Return *text* with every run of whitespace removed."""
    return _WHITESPACE_PATTERN.sub("", text)


class MixTextProcessor():
    """Split mixed text into its non-XML and XML parts."""

    def __repr__(self):
        # __repr__ must return a string; printing and returning None makes
        # repr(obj) raise TypeError.
        return "@an MixTextProcessor class"

    def get_xml_content(self, mixstr):
        """Return the XML content found in *mixstr*, or None if none matches."""
        found = _XML_CONTENT_PATTERN.search(mixstr)
        return found.group(0) if found else None

    def get_content_split(self, mixstr):
        """Split *mixstr* into its parts, in order of appearance.

        Returns ``[before_xml, xml, after_xml]`` when the split pattern
        matches, otherwise ``[mixstr]``. Punctuation is kept. Whitespace is
        not stripped, because XML tag attributes are separated by spaces.
        """
        mat = _SPLIT_PATTERN.match(mixstr)
        if mat is None:
            return [mixstr]
        return list(mat.groups())

    @classmethod
    def get_pinyin_split(cls, mixstr):
        """Split *mixstr* into ``[text, pinyins]`` pairs.

        The text before and after the XML segment is paired with an empty
        pinyin list; the XML segment is expanded through
        ``DomXml.get_pinyins_for_xml``. Input without an XML segment yields
        ``[[mixstr, []]]``.

        Raises:
            xml.parsers.expat.ExpatError: if a non-empty XML segment is not
                well-formed.
        """
        mat = _SPLIT_PATTERN.match(mixstr)
        if mat is None:
            return [[mixstr, []]]
        pre_xml, in_xml, after_xml = mat.groups()
        if not in_xml.strip():
            # An empty segment cannot be parsed (parseString raises
            # ExpatError on it); treat the whole input as plain text.
            return [[mixstr, []]]
        ctlist = [[pre_xml, []]]
        ctlist.extend(DomXml(in_xml).get_pinyins_for_xml())
        ctlist.append([after_xml, []])
        return ctlist


class DomXml():
    """Thin wrapper around a minidom document parsed from an XML string."""

    def __init__(self, xmlstr):
        """Parse *xmlstr*; raises ``xml.parsers.expat.ExpatError`` if malformed."""
        print("Parse xml str:", xmlstr)
        self.tdom = parseString(xmlstr)  # Document
        self.root = self.tdom.documentElement  # Element
        self.rnode = self.tdom.childNodes  # NodeList

    def _collect_text(self):
        """Collect text found up to two levels below the document's children.

        Anything nested deeper is not collected; its child count is printed
        instead.
        """
        texts = []
        for top in self.rnode:
            if top.nodeType == Node.TEXT_NODE:
                # Text nodes expose ``data``; they have no ``value`` attribute.
                texts.append(top.data)
                continue
            for child in top.childNodes:
                if isinstance(child, xml.dom.minidom.Text):
                    texts.append(child.data)
                    continue
                for grandchild in child.childNodes:
                    if isinstance(grandchild, xml.dom.minidom.Text):
                        texts.append(grandchild.data)
                    else:
                        print("len(nodes of x3):", len(grandchild.childNodes))
        return texts

    def get_text(self):
        """Return a list of all text content of the XML."""
        return self._collect_text()

    def get_xmlchild_list(self):
        """Return the XML content as a list of text items (without tags).

        The list is also printed before it is returned.
        """
        res = self._collect_text()
        print(res)
        return res

    def get_pinyins_for_xml(self):
        """Return the XML content as a list of ``[text, pinyins]`` pairs.

        Whitespace is removed from every text item. Text directly under the
        root gets an empty pinyin list. Text inside a child element gets that
        element's ``pinyin`` attribute split on single spaces, or an empty
        list when the attribute is absent.
        """
        res = []
        for top in self.rnode:
            if top.nodeType == Node.TEXT_NODE:
                res.append([_strip_whitespace(top.data), []])
                continue
            for child in top.childNodes:
                if isinstance(child, xml.dom.minidom.Text):
                    res.append([_strip_whitespace(child.data), []])
                    continue
                if child.nodeType != Node.ELEMENT_NODE:
                    # Comments and processing instructions carry no
                    # attributes and no text to report.
                    continue
                # Computed per element so that an element without the
                # attribute never inherits the previous element's pinyins.
                if child.hasAttribute('pinyin'):
                    pinyins = child.getAttribute("pinyin").split(" ")
                else:
                    pinyins = []
                for grandchild in child.childNodes:
                    if isinstance(grandchild, xml.dom.minidom.Text):
                        res.append(
                            [_strip_whitespace(grandchild.data), list(pinyins)])
                    else:
                        print("len(nodes of x3):", len(grandchild.childNodes))
        return res

    def get_all_tags(self, tag_name):
        """Print and return every *tag_name* element that has a ``pinyin`` attribute.

        Returns a list of ``(tag_name, pinyin, first_child_data)`` tuples, one
        per printed line. ``first_child_data`` is an empty string when the
        element has no first child or that child carries no ``data``.
        """
        found = []
        for elem in self.root.getElementsByTagName(tag_name):
            if not elem.hasAttribute('pinyin'):
                continue
            pinyin = elem.getAttribute('pinyin')
            # firstChild is None for an empty element such as <w pinyin="a"/>.
            text = getattr(elem.firstChild, "data", "")
            print(elem.tagName, 'pinyin', pinyin, text)
            found.append((elem.tagName, pinyin, text))
        return found