使用NekoHtml处理网页(删除Style标签)
生活随笔
收集整理的這篇文章主要介紹了
使用NekoHtml处理网页(删除Style标签)
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
最近在做一個手機APP,通過一個新聞抓取程序抓取新聞,然後通過APP展示新聞。後來發現手機端不支持Style標簽:如果網頁中有Style標簽,標簽內的內容會直接顯示出來,非常影響頁面美觀。於是就寫了一個用NekoHTML來清除Style標簽的工具類。
html.filter.properties 配置文件,配置允許的標簽和要刪除的標簽及標簽內的屬性
PropertiesUtils 讀取Properties
package com.tiamaes.gjds.util;

import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

import org.springframework.core.io.ClassPathResource;

/**
 * Reads property values from a properties file located on the classpath.
 *
 * <p>Author: 王成委</p>
 * <p>Created: 2015-01-28 11:23:27</p>
 * <p>Copyright: 2015 Tiamaes</p>
 */
public class PropertiesUtils {

    /** Loaded key/value pairs; stays empty when the resource could not be read. */
    private final Properties properties = new Properties();

    /**
     * Loads the properties file found at the given classpath location.
     *
     * <p>Loading is best-effort: an {@link IOException} is printed and the
     * instance is left with whatever entries were read so far, so
     * {@link #get(String)} then returns {@code null} for the missing keys.</p>
     *
     * @param path classpath-relative location of the properties file
     */
    public PropertiesUtils(String path) {
        InputStream in = null;
        try {
            in = new ClassPathResource(path).getInputStream();
            properties.load(in);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Properties.load leaves the stream open, so it must be closed here.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Looks up a single property.
     *
     * @param key property name
     * @return the configured value, or {@code null} when the key is absent
     */
    public String get(String key) {
        return this.properties.getProperty(key);
    }
}
// Filtering the tags in HTML
package com.tiamaes.gjds.util;

import java.io.CharArrayReader;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;

import org.apache.xerces.xni.parser.XMLDocumentFilter;
import org.cyberneko.html.filters.ElementRemover;
import org.cyberneko.html.filters.Writer;
import org.cyberneko.html.parsers.DOMParser;
import org.xml.sax.InputSource;

/**
 * Filters the tags in an HTML fragment using NekoHTML.
 *
 * <p>The initial tag and attribute lists are read from
 * {@code html.filter.properties} (keys {@code attributes}, {@code acceptTags}
 * and {@code removeTags}, each a comma-separated list) and can be adjusted at
 * runtime through the add/remove methods. Call {@link #doFilter(String)} to
 * filter HTML content.</p>
 *
 * <p>Author: 王成委</p>
 * <p>Created: 2015-01-29 10:45:02</p>
 * <p>Copyright: 2015 Tiamaes</p>
 */
public class HtmlFilterUtils {

    /** Configuration shared by every instance; loaded on first construction. */
    private static PropertiesUtils properties = null;

    /**
     * Lazily created shared instance. Declared volatile because
     * {@link #getInstance(boolean)} reads it without holding the lock.
     */
    private static volatile HtmlFilterUtils filter = null;

    private String configPath = "html.filter.properties";

    private static final String ATTRIBUTE_FIELD = "attributes";
    private static final String ACCEPT_TAGS_FIELD = "acceptTags";
    private static final String REMOVE_TAGS_FIELD = "removeTags";

    private List<String> attributes = new ArrayList<String>();
    private List<String> acceptTags = new ArrayList<String>();
    private List<String> removeTags = new ArrayList<String>();

    /** Creates the shared instance if it does not exist yet. */
    private static synchronized void syncInit() {
        if (filter == null) {
            filter = new HtmlFilterUtils();
        }
    }

    /** Loads the shared configuration once; later calls reuse it. */
    private static synchronized PropertiesUtils loadProperties(String path) {
        if (properties == null) {
            properties = new PropertiesUtils(path);
        }
        return properties;
    }

    /**
     * Returns the shared instance, creating it on first use.
     *
     * @return the shared filter
     */
    public static HtmlFilterUtils getInstance() {
        return getInstance(false);
    }

    /**
     * Returns either a fresh instance or the shared one.
     *
     * @param createNew {@code true} to get a new, independent instance whose
     *                  lists can be changed without affecting the shared one
     * @return the requested filter
     */
    public static HtmlFilterUtils getInstance(boolean createNew) {
        if (createNew) {
            return new HtmlFilterUtils();
        }
        if (filter == null) {
            syncInit();
        }
        return filter;
    }

    private HtmlFilterUtils() {
        PropertiesUtils config = loadProperties(configPath);
        this.addToList(attributes, config.get(ATTRIBUTE_FIELD));
        this.addToList(acceptTags, config.get(ACCEPT_TAGS_FIELD));
        this.addToList(removeTags, config.get(REMOVE_TAGS_FIELD));
    }

    /** Adds an attribute name to the attribute list (method name kept as-is for compatibility). */
    public void addAtributes(String attrName) {
        this.attributes.add(attrName);
    }

    /** Removes an attribute name from the attribute list (method name kept as-is for compatibility). */
    public void removeAtributes(String attrName) {
        this.attributes.remove(attrName);
    }

    /** Adds a tag to the list of tags to remove (method name kept as-is for compatibility). */
    public void addRmoveTag(String tagName) {
        this.removeTags.add(tagName);
    }

    /** Takes a tag off the list of tags to remove (method name kept as-is for compatibility). */
    public void removeRmoveTag(String tagName) {
        this.removeTags.remove(tagName);
    }

    /** Adds a tag to the list of accepted tags. */
    public void addAcceptTag(String tagName) {
        this.acceptTags.add(tagName);
    }

    /** Takes a tag off the list of accepted tags. */
    public void removeAcceptTag(String tagName) {
        this.acceptTags.remove(tagName);
    }

    /**
     * Appends the entries of a comma-separated string to a list.
     *
     * <p>A {@code null} list or a {@code null} source (for example a key
     * missing from the configuration) is ignored. Entries are trimmed and
     * blank entries are skipped, so {@code "a, b,"} yields {@code a} and
     * {@code b}.</p>
     *
     * @param list    target list
     * @param sources comma-separated values, may be {@code null}
     */
    private void addToList(List<String> list, String sources) {
        if (list == null || sources == null) {
            return;
        }
        for (String str : sources.split(",")) {
            String name = str.trim();
            if (!name.isEmpty()) {
                list.add(name);
            }
        }
    }

    /**
     * Runs the given HTML through a NekoHTML {@code ElementRemover} set up
     * from the current lists: every accepted tag is registered together with
     * the attribute list, and every tag in the remove list is registered for
     * removal.
     *
     * @param htmlCode HTML to filter; {@code null} is returned unchanged
     * @return the filtered HTML, or the unmodified input when parsing fails
     */
    public String doFilter(String htmlCode) {
        if (htmlCode == null) {
            return null;
        }

        ElementRemover remover = new ElementRemover();
        // Build the attribute array once; it is the same for every accepted tag.
        String[] attrs = attributes.toArray(new String[attributes.size()]);
        for (String tag : acceptTags) {
            remover.acceptElement(tag, attrs);
        }
        for (String tag : removeTags) {
            remover.removeElement(tag);
        }

        CharArrayReader reader = new CharArrayReader(htmlCode.toCharArray());
        try {
            StringWriter filteredDescription = new StringWriter();
            Writer writer = new Writer(filteredDescription, "UTF-8");
            // Filter order matters: the remover runs before the serializing writer.
            XMLDocumentFilter[] filters = {remover, writer};
            DOMParser parser = new DOMParser();
            parser.setProperty("http://cyberneko.org/html/properties/filters", filters);
            parser.parse(new InputSource(reader));
            return filteredDescription.toString();
        } catch (Exception e) {
            // Best effort: fall back to the unfiltered input.
            e.printStackTrace();
            return htmlCode;
        } finally {
            reader.close();
        }
    }
}
// Usage: call doFilter to filter the HTML content.
總結
以上是生活随笔為你收集整理的使用NekoHtml处理网页(删除Style标签)的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: [html] 制作页面时,前端如何适应
- 下一篇: [html] 如何在IOS下启用Web