设为首页 - 加入收藏 核心网 ()- 云主机,资讯,互联网,人工智能,云计算,大赢家论坛,区块链,VR,站长网!
热搜: 2019 统一 潜艇 模式
当前位置: 主页 > 水果奶奶论坛 > 正文

c#中过滤html的正则表达式

发布时间:2021-07-18 01:25 所属栏目:[水果奶奶论坛] 来源:互联网
导读:这篇文章主要为大家介绍c#中过滤html的正则表达式,需要的朋友可以参考一下

/// <summary> /// 去除HTML标记 /// </summary> /// <param name=”NoHTML”>包括HTML的源码 </param> /// <returns>已经去除后的文字</returns> public static string NoHTML(string Htmlstring) { //删除脚本 Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase); //删除HTML Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"–>", "", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"<!–.*", "", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase); Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase); Htmlstring.Replace("<", ""); Htmlstring.Replace(">", ""); Htmlstring.Replace("\r\n", ""); Htmlstring = HttpContext.Current.Server.HtmlEncode(Htmlstring).Trim(); return Htmlstring; }

C#过滤Html标签及空格

public static string FilterHTML(string HTMLStr) { if (!string.IsNullOrEmpty(HTMLStr)) return System.Text.RegularExpressions.Regex.Replace(HTMLStr, "<[^>]*>| ", ""); else return ""; }

写一个静态方法移除HTML标签

#region /// <summary> /// 移除HTML标签 /// </summary> /// <param>HTMLStr</param> public static string ParseTags(string HTMLStr) { return System.Text.RegularExpressions.Regex.Replace(HTMLStr, "<[^>]*>", ""); } #endregion

取出文本中的图片地址

#region /// <summary> /// 取出文本中的图片地址 /// </summary> /// <param>HTMLStr</param> public static string GetImgUrl(string HTMLStr) { string str = string.Empty; string sPattern = @"^<img\s+[^>]*>"; Regex r = new Regex(@"<img\s+[^>]*\s*src\s*=\s*([']?)(?<url>\S+)'?[^>]*>", RegexOptions.Compiled); Match m = r.Match(HTMLStr.ToLower()); if (m.Success) str = m.Result("${url}"); return str; } #endregion

提取HTML代码中文字的C#函数

/// <summary> /// 提取HTML代码中文字的C#函数 /// </summary> /// <param>包括HTML的源码 </param> /// <returns>已经去除后的文字</returns> using System; using System.Text.RegularExpressions; public class StripHTMLTest { public static void Main() { string s = StripHTML( "<HTML><HEAD><TITLE>中国石龙信息平台</TITLE></HEAD><BODY>faddfs龙信息平台</BODY></HTML>"); Console.WriteLine(s); } public static string StripHTML(string strHtml) { string[]aryReg = { @"<script[^>]*?>.*?</script>", @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[" "'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>", @"([\r\n])[\s]+", @ "&(quot|#34);", @"&(amp|#38);", @"&(lt|#60);", @"&(gt|#62);", @ "&(nbsp|#160);", @"&(iexcl|#161);", @"&(cent|#162);", @"&(pound|#163);", @"&(copy|#169);", @"&#(\d+);", @"-->", @"<!--.*\n" }; string[]aryRep = { "", "", "", "\"", "&", "<", ">", " ", "\xa1", //chr(161), "\xa2", //chr(162), "\xa3", //chr(163), "\xa9", //chr(169), "", "\r\n", "" }; string newReg = aryReg[0]; string strOutput = strHtml; for (int i = 0; i < aryReg.Length; i++) { Regex regex = new Regex(aryReg[i], RegexOptions.IgnoreCase); strOutput = regex.Replace(strOutput, aryRep[i]); } strOutput.Replace("<", ""); strOutput.Replace(">", ""); strOutput.Replace("\r\n", ""); return strOutput; } }

TempContent 表示包含有html的字符串;
TempContent = System.Text.RegularExpressions.Regex.Replace(TempContent,"<[^>]+>","");至少一个
TempContent = System.Text.RegularExpressions.Regex.Replace(TempContent,"<[^>]*>","");任意个 

【免责声明】本站内容转载自互联网,其相关言论仅代表作者个人观点绝非权威,不代表本站立场。如您发现内容存在版权问题,请提交相关链接至邮箱:bqsm@foxmail.com,我们将及时予以处理。

网友评论
推荐文章