分享

正则表达式彻底去除HTML

 pengx 2008-09-05
public static String RMHTML(String Htmlstring)

 {

/**

转载请注明:

PowerBy:Lulu

Www.HotCity.Cn

*/

// 删除脚本

 Htmlstring = RegexPattern("<\\s*?script[^>]*>[\\s\\S]*?<\\s*?/\\s*?script\\s*?>","",Htmlstring);

// 删除HTML

 Htmlstring = RegexPattern("<([^>]*)>", "",Htmlstring);

 Htmlstring = RegexPattern( "([\r\n])[\\s]+", "",Htmlstring);

 Htmlstring = RegexPattern( "-->", "",Htmlstring);

 Htmlstring = RegexPattern( "<!--.*", "",Htmlstring);

 Htmlstring = RegexPattern( "&(quot|#34);", "\"",Htmlstring);

 Htmlstring = RegexPattern( "&(amp|#38);", "&",Htmlstring);

 Htmlstring = RegexPattern( "&(lt|#60);", "<",Htmlstring);

 Htmlstring = RegexPattern( "&(gt|#62);", ">",Htmlstring);

 Htmlstring = RegexPattern( "&(nbsp|#160);", " ",Htmlstring);

 Htmlstring = RegexPattern( "&(iexcl|#161);", "\\xa1",Htmlstring);

 Htmlstring = RegexPattern( "&(cent|#162);", "\\xa2",Htmlstring);

 Htmlstring = RegexPattern( "&(pound|#163);", "\\xa3",Htmlstring);

 Htmlstring = RegexPattern( "&(copy|#169);", "\\xa9",Htmlstring);

 Htmlstring = RegexPattern( "&#(\\d+);", "",Htmlstring);

 

 Htmlstring = RegexPattern("<", "",Htmlstring);

 Htmlstring = RegexPattern(">", "",Htmlstring);

 //Htmlstring.replace("\r\n", "",Htmlstring);

 

 return Htmlstring;

 }
   public static String RegexPattern(String pattern,String str,String content){

       if(pattern!=null && !pattern.equals("")){


           Pattern p = Pattern.compile(pattern,2); //参数2表示大小写不区分

           Matcher m = p.matcher(content);
           content=m.replaceAll(str);
 

       }
       return content;      
   } 

    本站是提供个人知识管理的网络存储空间,所有内容均由用户发布,不代表本站观点。请注意甄别内容中的联系方式、诱导购买等信息,谨防诈骗。如发现有害或侵权内容,请点击一键举报。
    转藏 分享 献花(0)

    0条评论

    发表

    请遵守用户 评论公约

    类似文章 更多