java解析xml

Java解析xml

遇到一个问题,要解析一个xml,网上搜了搜,都说有4种方式。先试了试DOM解析,感觉开发效率太低,忽然想到Jsoup,然后就用了第5种方式:Jsoup解析XML。
用Jsoup解析XML,开发效率确实高,但是运行效率太低了:解析一个10K左右的xml要0.2s左右,300万个xml文件要解析到什么时候呀。
然后又试了试DOM解析xml,解析一个10K左右的xml只要0.05s左右,运行效率提高了不少。
当然,还有其他3种方式解析。知道Sax解析时占用内存小,可能会快一点,但是着急处理文件,暂时没有测试。

五种解析方式:DOM、SAX、JDOM、DOM4J、Jsoup


import javax.xml.parsers.DocumentBuilderFactory;

import java.io.*;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.ParserConfigurationException;

import com.xxxx.usdp.odk.common.file.FileUtil;
import com.xxxx.usdp.xxxx.poc.yuyin.entity.XmlEntity;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.*;
import org.xml.sax.SAXException;

/**
 * Parses XML files with the JAXP DOM API and merges the two channel
 * transcripts they contain into a single dialogue ordered by start time.
 *
 * @version V1.0
 */
public class DomParserXml {

    private static final Logger log = LoggerFactory.getLogger(DomParserXml.class);

    /** Maximum number of files processed by a single {@link #batchParser()} run. */
    private static final int BATCH_LIMIT = 1000;

    /** Speaker label attached to the terms of the first channel. */
    private static final String SPEAKER_N0 = "n0";

    /** Speaker label attached to the terms of the second channel. */
    private static final String SPEAKER_N1 = "n1";

    /**
     * Parses one hard-coded file and logs the resulting entity.
     */
    @Test
    public void testPaser(){
        // Alternative inputs used during development:
        //   src/main/resources/2018010109013384362390728_1522286730680.xml
        //   src/main/resources/test.xml
        String xmlFilePath = "C:\\home\\user1\\xxxx\\2018010109000672062385406";
        XmlEntity xmlEntity = parserXml(xmlFilePath);
        log.info("xml数据:\r\n{}", xmlEntity);
    }

    /**
     * Parses up to {@link #BATCH_LIMIT} files of the input directory and writes
     * the merged dialogue of each one to a {@code .txt} file in the output
     * directory. Files that cannot be parsed are logged and skipped.
     */
    @Test
    public void batchParser(){
        String filePath = "D:\\data\\210_1\\210_test";
        String outDirPath = "D:\\data\\210_1\\210_201801_result";
        File outDir = new File(outDirPath);
        if (!outDir.exists() && !outDir.mkdirs()) {
            log.error("创建输出目录失败 {}", outDirPath);
            return;
        }

        // listFiles() returns null when the path is not a readable directory.
        File[] files = new File(filePath).listFiles();
        if (files == null) {
            log.error("无法读取输入目录 {}", filePath);
            return;
        }

        // Never index past the end of the listing when it holds fewer files than the cap.
        int count = Math.min(files.length, BATCH_LIMIT);
        for (int i = 0; i < count; i++) {
            File f = files[i];
            XmlEntity xmlEntity = parserXml(f.getAbsolutePath());
            if (xmlEntity == null) {
                log.error("跳过无法解析的文件 {}", f.getAbsolutePath());
                continue;
            }
            try {
                FileUtil.writeStringToFile(xmlEntity.getMix(), outDirPath+"/"+xmlEntity.getFileName()+".txt");
            } catch (IOException e) {
                // Trailing throwable argument makes SLF4J print the stack trace as well.
                log.error("写文件出错 {}", e.toString(), e);
            }
        }
    }

    /**
     * Parses the XML file at the given path into dialogue form.
     *
     * @param xmlFilePath path of the XML file, relative or absolute
     * @return the populated entity, or {@code null} if the file cannot be
     *         parsed or does not have the expected structure
     */
    public static XmlEntity parserXml(String xmlFilePath) {
        return parserXml(new File(xmlFilePath));
    }

    /**
     * Parses an XML file into dialogue form.
     *
     * <p>The code expects: a first {@code instance} element carrying the
     * {@code waveuri} and {@code duration} attributes; at least two
     * {@code subject} elements; and, inside the second {@code subject}, at
     * least two {@code channel} elements, each with a {@code text} child
     * (space-separated terms) and a {@code time} child (space-separated
     * {@code start,end} pairs, one per term). Terms of both channels are
     * sorted by start time and concatenated, starting a new line prefixed with
     * the speaker label whenever the speaker changes.
     *
     * @param f the XML file to parse
     * @return the populated entity, or {@code null} if the file cannot be
     *         parsed or does not have the expected structure
     */
    public static XmlEntity parserXml(File f) {
        long t1 = System.currentTimeMillis();

        // NOTE(review): the factory is used with its default configuration; if
        // inputs may come from untrusted sources, DTD/external-entity
        // processing should be disabled here.
        Document document;
        try {
            DocumentBuilder db = DocumentBuilderFactory.newInstance().newDocumentBuilder();
            document = db.parse(f);
        } catch (ParserConfigurationException | SAXException | IOException e) {
            log.error("Dom解析Xml出错 {}", e.toString(), e);
            // Without a document nothing below can work; report failure like
            // the structural checks do instead of failing with an NPE.
            return null;
        }

        XmlEntity xmlEntity = new XmlEntity();
        xmlEntity.setFileName(f.getName().replace(".xml", ""));

        long t2 = System.currentTimeMillis();
        log.info("读文件用时{}s", 1.0*(t2-t1)/1000);

        Element instance = (Element) document.getElementsByTagName("instance").item(0);
        if (instance == null) {
            log.error("文件格式错误,缺少instance节点");
            return null;
        }
        // Location of the stored file as recorded in the XML.
        String waveuri = instance.getAttribute("waveuri");
        log.debug("waveuri:{}",waveuri);
        xmlEntity.setWaveuri(waveuri);
        String duration = instance.getAttribute("duration");
        log.debug("duration:{}",duration);
        xmlEntity.setDuration(duration);

        NodeList subjectNodes = document.getElementsByTagName("subject");
        if(subjectNodes == null || subjectNodes.getLength() < 2){
            log.error("文件格式错误,subject节点个数小于2个");
            return null;
        }
        log.debug("subject节点个数:{}", subjectNodes.getLength());

        // The second subject holds the transcript text and its time slices.
        Element subject1 = (Element) subjectNodes.item(1);
        NodeList channels = subject1.getElementsByTagName("channel");
        log.debug("channels 节点个数:{}", channels.getLength());
        if (channels.getLength() < 2) {
            log.error("文件格式错误,channel节点个数小于2个");
            return null;
        }

        List<TimeTextEntity> timeTextList = new ArrayList<>();

        // First channel -> speaker n0
        Element c1 = (Element) channels.item(0);
        log.debug("tagname:{}", c1.getTagName());
        String textA = childText(c1, "text");
        log.debug("textA:|{}|", textA);
        xmlEntity.setN0(textA);
        String timeA = childText(c1, "time");
        log.debug("timeA:|{}|", timeA);
        int textLengthA = collectTerms(textA, timeA, SPEAKER_N0, timeTextList);
        log.debug("textLengthA:{}", textLengthA);

        // Second channel -> speaker n1
        Element c2 = (Element) channels.item(1);
        log.debug("tagname2:{}", c2.getTagName());
        String textB = childText(c2, "text");
        log.debug("textB:|{}|", textB);
        xmlEntity.setN1(textB);
        String timeB = childText(c2, "time");
        log.debug("timeB:|{}|", timeB);
        int textLengthB = collectTerms(textB, timeB, SPEAKER_N1, timeTextList);
        log.debug("textLengthB:{}", textLengthB);

        long t4 = System.currentTimeMillis();
        // Ascending by start time; the sort is stable, so terms with equal
        // start keep their insertion order (n0 before n1).
        Collections.sort(timeTextList, new Comparator<TimeTextEntity>() {
            @Override
            public int compare(TimeTextEntity o1, TimeTextEntity o2) {
                return Integer.compare(o1.getStart(), o2.getStart());
            }
        });
        long t5 = System.currentTimeMillis();
        log.info("排序用时{}s", 1.0*(t5-t4)/1000);

        String mix = buildDialogue(timeTextList);
        xmlEntity.setMix(mix);

        long t3 = System.currentTimeMillis();
        log.info("解析用时{}s", 1.0*(t3-t2)/1000);
        log.info("总共用时{}s", 1.0*(t3-t1)/1000);
        log.debug("对话:{}", mix);

        return xmlEntity;
    }

    /**
     * Returns the trimmed text content of the first descendant element with
     * the given tag name, or an empty string when there is no such element.
     */
    private static String childText(Element parent, String tagName) {
        Node node = parent.getElementsByTagName(tagName).item(0);
        return node == null ? "" : node.getTextContent().trim();
    }

    /**
     * Pairs each space-separated term of {@code text} with the time slice at
     * the same position in {@code time} and appends the pairs to {@code out}.
     * Slices that are not of the form {@code start,end} with integer bounds
     * are logged and skipped, as are terms without a matching slice.
     *
     * @return the number of terms found in {@code text} (0 for empty text)
     */
    private static int collectTerms(String text, String time, String who, List<TimeTextEntity> out) {
        // "".split(" ") yields one empty token, so test for emptiness
        // explicitly; this also keeps channels that contain a single term.
        if (text.isEmpty()) {
            return 0;
        }
        String[] terms = text.split(" ");
        String[] slices = time.split(" ");
        if (terms.length != slices.length) {
            log.warn("{} 的词语个数({})与时间片个数({})不一致", who, terms.length, slices.length);
        }
        int count = Math.min(terms.length, slices.length);
        for (int i = 0; i < count; i++) {
            String[] bounds = slices[i].split(",");
            if (bounds.length < 2) {
                log.warn("时间片格式错误,已跳过: {}", slices[i]);
                continue;
            }
            try {
                int start = Integer.parseInt(bounds[0]);
                int end = Integer.parseInt(bounds[1]);
                out.add(new TimeTextEntity(start, end, terms[i], who));
            } catch (NumberFormatException e) {
                log.warn("时间片格式错误,已跳过: {}", slices[i]);
            }
        }
        return terms.length;
    }

    /**
     * Concatenates the given terms in list order, each followed by a space.
     * Whenever the speaker differs from the previous term's speaker (including
     * at the very first term) a line break and a "speaker : " prefix are
     * emitted first.
     */
    private static String buildDialogue(List<TimeTextEntity> terms) {
        StringBuilder sb = new StringBuilder();
        String flag = null;
        for (int i = 0; i < terms.size(); i++) {
            TimeTextEntity entity = terms.get(i);
            log.debug("{}  {}", i, entity);

            String who = entity.getWho();
            if (!who.equals(flag)) {
                sb.append("\r\n");
                flag = who;
                sb.append(flag);
                sb.append(" : ");
            }
            sb.append(entity.getText());
            sb.append(" ");
        }
        return sb.toString();
    }

}


/**
 * 时间段对象<br>
 */
class TimeEntity{
    
    private int start;
    private int end;
    
    public TimeEntity(){
    
    }
    
    public TimeEntity(int start, int end){
        this.start = start;
        this.end = end;
    }
    
    public int getStart() {
        return start;
    }
    
    public void setStart(int start) {
        this.start = start;
    }
    
    public int getEnd() {
        return end;
    }
    
    public void setEnd(int end) {
        this.end = end;
    }
    
    @Override
    public String toString() {
        return "TimeEntity{" + "start='" + start + '\'' + ", end='" + end +
            '\'' + '}';
    }
    
}

/**
 *
 */
class TimeTextEntity{
    
    private int start;
    private int end;
    private String text;
    /** n0 n1 */
    private String who;
    
    public TimeTextEntity(){
    
    }
    
    public TimeTextEntity(int start, int end, String text, String who){
        this.start = start;
        this.end = end;
        this.text = text;
        this.who = who;
    }
    
    public int getStart() {
        return start;
    }
    
    public void setStart(int start) {
        this.start = start;
    }
    
    public int getEnd() {
        return end;
    }
    
    public void setEnd(int end) {
        this.end = end;
    }
    
    public String getText() {
        return text;
    }
    
    public void setText(String text) {
        this.text = text;
    }
    
    public String getWho() {
        return who;
    }
    
    public void setWho(String who) {
        this.who = who;
    }
    
    @Override
    public String toString() {
        return "TimeTextEntity{" + "start=" + start + ", end=" + end + ", " +
            "text='" + text + '\'' + ", who='" + who + '\'' + '}';
    }
}