百度空间 | 百度首页 
               
 
查看文章
 
精确设置NekoHTML生成DOM树形
2008-05-11 08:27
NekoHTML是目前最好的HTML转化成XML的工具之一,但它的Filter功能还是非常有限,比如它目前提供的ElementRemover仅支持对Element的accept和remove功能,假如我想生成一个仅由Element和Text结点构成的DOM树(Element去除"style"和"script"),则只能自己写一个ElementRemover来实现。下面是自己写的一个实现:

import org.apache.xerces.xni.Augmentations;
import org.apache.xerces.xni.QName;
import org.apache.xerces.xni.XMLAttributes;
import org.apache.xerces.xni.XMLResourceIdentifier;
import org.apache.xerces.xni.XMLString;
import org.apache.xerces.xni.XNIException;
import org.cyberneko.html.filters.ElementRemover;

public class MdrElementRemover extends ElementRemover {
    protected int fNonElementDepth = 0;

    // since Xerces-J 2.2.0

    /** Start prefix mapping. */
    public void startPrefixMapping(String prefix, String uri, Augmentations augs)
        throws XNIException {
        fNonElementDepth++;
    }

    /** End prefix mapping. */
    public void endPrefixMapping(String prefix, Augmentations augs)
        throws XNIException {
        fNonElementDepth--;
    }

    //
    // Protected methods
    //
   
    /** Start element. */
    public void startElement(QName element, XMLAttributes attributes, Augmentations augs)
        throws XNIException {
        if (fElementDepth <= fRemovalElementDepth && handleOpenTag(element, attributes)) {
            super.startElement(element, attributes, augs);
        }
        fElementDepth++;
    }

    /** Empty element. */
    public void emptyElement(QName element, XMLAttributes attributes, Augmentations augs)
        throws XNIException {
        if (fElementDepth <= fRemovalElementDepth && handleOpenTag(element, attributes)) {
            super.emptyElement(element, attributes, augs);
        }
    }

    /** Comment. */
    public void comment(XMLString text, Augmentations augs)
        throws XNIException {}

    /** Processing instruction. */
    public void processingInstruction(String target, XMLString data, Augmentations augs)
        throws XNIException {}

    /** Characters. */
    public void characters(XMLString text, Augmentations augs)
        throws XNIException {
        if (fElementDepth <= fRemovalElementDepth && fNonElementDepth <= 0) {
            if(text.toString().trim().length > 0) {
               super.characters(text, augs);
           }
        }
    }

    /** Ignorable whitespace. */
    public void ignorableWhitespace(XMLString text, Augmentations augs)
        throws XNIException {}

    /** Start general entity. */
    public void startGeneralEntity(String name, XMLResourceIdentifier id, String encoding, Augmentations augs)
        throws XNIException {
        fNonElementDepth++;
    }

    /** Text declaration. */
    public void textDecl(String version, String encoding, Augmentations augs)
        throws XNIException {}

    /** End general entity. */
    public void endGeneralEntity(String name, Augmentations augs)
        throws XNIException {
        fNonElementDepth--;
    }

    /** Start CDATA section. */
    public void startCDATA(Augmentations augs) throws XNIException {
        fNonElementDepth++;
    }

    /** End CDATA section. */
    public void endCDATA(Augmentations augs) throws XNIException {
        fNonElementDepth--;
    }

    /** End element. */
    public void endElement(QName element, Augmentations augs)
        throws XNIException {
        if (fElementDepth <= fRemovalElementDepth && elementAccepted(element.rawname)) {
            super.endElement(element, augs);
        }
        fElementDepth--;
        if (fElementDepth == fRemovalElementDepth) {
            fRemovalElementDepth = Integer.MAX_VALUE;
        }
    }

   

    /** Returns true if the specified element is accepted. */
    protected boolean elementAccepted(String element) {
        return true;
    }

    /** Returns true if the specified element should be removed. */
    protected boolean elementRemoved(String element) {
        Object key = element.toLowerCase();
        if (key.equals("style") || key.equals("script")) {
            return true;
        } else {
            return false;
        }
    }

    /** Handles an open tag. */
    protected boolean handleOpenTag(QName element, XMLAttributes attributes) {
        if (elementRemoved(element.rawname)) {
            fRemovalElementDepth = fElementDepth;
            return false;
        }
        return true;
    }
}

以后在构造DomParser时,采用如下代码即可:

XMLDocumentFilter[] filters = { new MdrElementRemover() };
DomParser parser = new DomParser();
domParser.setProperty("http://cyberneko.org/html/properties/filters", filters);

类别:Hacks | 添加到搜藏 | 浏览() | 评论 (0)
 
最近读者:
 
网友评论:
发表评论:
姓 名:
网址或邮箱: (选填)
内 容:
验证码: 请点击后输入四位验证码,字母不区分大小写
      

     

©2009 Baidu