C#自寫的一個HTML解析類(類似XElement語法)。本站提示廣大學習愛好者:《C#自寫的一個HTML解析類(類似XElement語法)》一文僅供參考,不一定能得到您想要的結果。以下是《C#自寫的一個HTML解析類(類似XElement語法)》正文
功能:
1、輕鬆獲取指定的HTML元素。
2、可以根據屬性、標簽進行篩選。
3、返回的都是List強類型,無需轉換。
用過XElement的都知道,用它來解析XML非常方便,但是HTML的格式多樣化,XElement實在沒辦法兼容。
所以我就寫了這麼一個類似XElement的 XHtmlElement。
用法:
// Usage example (ASP.NET WebForms page context: Server and Response come from the page,
// FileHelper is a project helper that reads a file into a string).
string filePath = Server.MapPath("~/file/test.htm");

// Load the HTML source.
string mailBody = FileHelper.FileToString(filePath);
XHtmlElement xh = new XHtmlElement(mailBody);

// <a> elements that are direct children of <body> and have class="icon".
// Attributes is a Dictionary, so look the key up directly instead of scanning every pair.
var link = xh.Descendants("body")
             .ChildDescendants("a")
             .Where(c =>
             {
                 string cssClass;
                 return c.Attributes.TryGetValue("class", out cssClass) && cssClass == "icon";
             })
             .ToList();

// <a> elements that carry an href attribute.
var links = xh.Descendants("a")
              .Where(c => c.Attributes.ContainsKey("href"))
              .ToList();
foreach (var r in links)
{
    // Write out the href value.
    Response.Write(r.Attributes["href"]);
}

// <img> elements at the nearest level that contains one (a list, not a single element).
var img = xh.Descendants("img");

// The nearest <p> element together with the other <p> elements on the same level.
var ps = xh.Descendants("p");
代碼:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;

namespace SyntacticSugar
{
    /// <summary>
    /// Small regex-based HTML parser with a query surface similar to XElement.
    /// Created 2015-04-23 by sunkaixuan (QQ 610262374); feedback on naming and syntax is welcome.
    /// </summary>
    /// <remarks>
    /// This is a best-effort splitter built on regular expressions, not a conforming HTML parser.
    /// It recognises double-quoted attributes and tag names made of letters and digits.
    /// </remarks>
    public class XHtmlElement
    {
        // Name of the first opening tag in a fragment: letters/digits directly after '<'
        // (optionally preceded by a few spaces) and followed by '>', whitespace or '/'.
        // Shared by the splitter and the element parser so both agree on names such as "h1".
        private const string TagNamePattern = @"(?<=\<\s{0,5})[a-zA-Z][a-zA-Z0-9]*(?=\>|\s|\/)";

        // Delimiters of an IE conditional comment block: <!--[if ...]> ... [endif]-->
        private const string ConditionalCommentBeginPattern = @"\s{0,10}\<\!\-\-\[if";
        private const string ConditionalCommentEndPattern = @"\[endif\]\-\-\>";

        private const string UnparsableHtmlMessage = "SORRY,您的HTML格局不克不及解析!!!";

        // name="value" pairs. The value is captured as a group so '=' inside it is preserved.
        private static readonly Regex AttributeRegex =
            new Regex(@"(?<key>[a-zA-Z_:][\w:.\-]*)\s*=\s*""(?<value>[^""]*)""");

        private readonly string _html;

        /// <summary>
        /// Creates a parser over the given HTML fragment.
        /// </summary>
        /// <param name="html">HTML source to query.</param>
        public XHtmlElement(string html)
        {
            _html = html;
        }

        /// <summary>
        /// Returns the elements with the given tag name found on the nearest level that contains
        /// at least one of them: the root level first, then a depth-first search of the children.
        /// </summary>
        /// <param name="elementName">Tag name to look for (case-insensitive); null matches every element.</param>
        /// <returns>The matching elements; an empty list when nothing matches (never null).</returns>
        /// <exception cref="ArgumentNullException">The HTML passed to the constructor was null.</exception>
        /// <exception cref="FormatException">The HTML could not be split into elements.</exception>
        public List<HtmlInfo> Descendants(string elementName = null)
        {
            if (_html == null)
            {
                throw new ArgumentNullException("html", "html不克不及這空!");
            }

            var rootElements = RootDescendants(_html);
            var matches = rootElements.Where(c => IsTagMatch(c, elementName)).ToList();
            if (matches.Count == 0)
            {
                matches = GetDescendantsSource(rootElements, elementName);
            }
            return matches;
        }

        /// <summary>
        /// Splits a fragment into its first-level elements and parses each one.
        /// </summary>
        /// <param name="html">Fragment to split; null means the HTML passed to the constructor.</param>
        /// <returns>One <see cref="HtmlInfo"/> per first-level element, in extraction order.</returns>
        /// <exception cref="ArgumentNullException">Both <paramref name="html"/> and the constructor HTML are null.</exception>
        /// <exception cref="FormatException">The HTML could not be split into elements.</exception>
        public List<HtmlInfo> RootDescendants(string html = null)
        {
            /*
             * Algorithm:
             * 1. Take the first opening tag and scan forward to its closing tag, counting nested
             *    opening tags of the same name so the matching closing tag is used.
             * 2. Remove that element from the input and repeat for the 2nd ... Nth element.
             */
            if (html == null)
            {
                html = _html;
            }
            if (html == null)
            {
                throw new ArgumentNullException("html");
            }

            var result = new List<HtmlInfo>();
            foreach (var elementHtml in SplitTopLevelElements(html))
            {
                result.Add(ParseElement(elementHtml, html));
            }
            return result;
        }

        #region private

        /// <summary>Case-insensitive tag-name test; a null name matches everything.</summary>
        private static bool IsTagMatch(HtmlInfo element, string elementName)
        {
            return elementName == null
                || string.Equals(element.TagName, elementName, StringComparison.OrdinalIgnoreCase);
        }

        /// <summary>
        /// Builds the <see cref="HtmlInfo"/> for one element string produced by the splitter.
        /// </summary>
        private static HtmlInfo ParseElement(string elementHtml, string sameLevelHtml)
        {
            var info = new HtmlInfo();
            info.OldFullHtml = elementHtml;
            info.SameLeveHtml = sameLevelHtml;
            info.TagName = Regex.Match(elementHtml, TagNamePattern).Value;
            // Everything between the first '>' and the last '<'; empty for self-closing or empty elements.
            info.InnerHtml = Regex.Match(elementHtml, @"(?<=\>).+(?=<)", RegexOptions.Singleline).Value;
            info.Attributes = new Dictionary<string, string>();

            // Singleline so an opening tag that wraps over several lines is still captured whole.
            string openingTag = Regex.Match(elementHtml, "<.+?>", RegexOptions.Singleline).Value;
            foreach (Match attribute in AttributeRegex.Matches(openingTag))
            {
                string key = attribute.Groups["key"].Value;
                // A repeated attribute keeps its first value instead of throwing on the duplicate key.
                if (!info.Attributes.ContainsKey(key))
                {
                    info.Attributes.Add(key, attribute.Groups["value"].Value);
                }
            }
            return info;
        }

        /// <summary>
        /// Depth-first search for the first level below <paramref name="allList"/> that holds matches.
        /// </summary>
        /// <returns>The matches of that level, or an empty list when no level has any.</returns>
        private List<HtmlInfo> GetDescendantsSource(List<HtmlInfo> allList, string elementName)
        {
            foreach (var element in allList)
            {
                if (element.InnerHtml == null || !element.InnerHtml.Contains("<"))
                {
                    continue; // plain text, nothing to descend into
                }

                var children = RootDescendants(element.InnerHtml);
                var matches = children.Where(c => IsTagMatch(c, elementName)).ToList();
                if (matches.Count == 0)
                {
                    matches = GetDescendantsSource(children, elementName);
                }
                if (matches.Count > 0)
                {
                    return matches;
                }
            }
            return new List<HtmlInfo>();
        }

        /// <summary>
        /// Cuts a fragment into the raw strings of its first-level elements.
        /// </summary>
        /// <exception cref="FormatException">A tag was found but no element text could be extracted for it.</exception>
        private static List<string> SplitTopLevelElements(string html)
        {
            var elements = new List<string>();
            while (true)
            {
                string tagName = Regex.Match(html, TagNamePattern).Value;
                if (tagName.Length == 0)
                {
                    break; // no further opening tag: remaining text is ignored
                }

                string name = Regex.Escape(tagName);
                // \b keeps "<a" from also matching "<abbr>", "<article>", ...
                string beginPattern = @"<\s{0,10}" + name + @"\b.*?>";
                string endPattern = @"\<\/" + name + @"\>";
                // [^<>]* keeps the self-closing test inside a single tag, so "<div><img /></div>"
                // is not mistaken for a self-closing <div>.
                string selfClosingPattern = @"<\s{0,10}" + name + @"\b[^<>]*/>";

                // Case 1: <a/>   Case 2: <a></a>   Case 3: <a> (malformed, no closing tag)
                // Case 4: conditional comment <!--[if ...]> ... [endif]-->
                string elementHtml;
                Match selfClosing = Regex.Match(html, selfClosingPattern, RegexOptions.IgnoreCase);
                if (selfClosing.Success)
                {
                    elementHtml = selfClosing.Value;
                }
                else if (!Regex.IsMatch(html, endPattern, RegexOptions.IgnoreCase))
                {
                    if (Regex.IsMatch(html, ConditionalCommentBeginPattern))
                    {
                        elementHtml = GetElementString(html, ConditionalCommentBeginPattern, ConditionalCommentEndPattern, 1);
                    }
                    else
                    {
                        elementHtml = Regex.Match(html, beginPattern, RegexOptions.IgnoreCase | RegexOptions.Singleline).Value;
                    }
                }
                else
                {
                    elementHtml = GetElementString(html, beginPattern, endPattern, 1);
                }

                // Remove only the extracted occurrence: identical siblings must each be reported.
                int index = elementHtml.Length == 0 ? -1 : html.IndexOf(elementHtml, StringComparison.Ordinal);
                if (index < 0)
                {
                    throw new FormatException(UnparsableHtmlMessage);
                }

                elements.Add(elementHtml);
                html = html.Remove(index, elementHtml.Length);
                html = Regex.Replace(html, @"<\!DOCTYPE.*?>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);
                if (Regex.IsMatch(html, @"^\s*$"))
                {
                    break;
                }
            }
            return elements;
        }

        /// <summary>
        /// Returns the text from the first begin tag up to the closing tag that balances it.
        /// Starting with <paramref name="i"/> closing tags, the candidate is widened by one closing
        /// tag at a time until it holds as many begin tags as closing tags.
        /// </summary>
        /// <returns>The balanced element text, or an empty string when no balanced span exists.</returns>
        private static string GetElementString(string html, string currentTagBeginReg, string currentTagEndReg, int i)
        {
            while (true)
            {
                string candidate = Regex.Match(
                    html,
                    currentTagBeginReg + @"((.*?)" + currentTagEndReg + "){" + i + "}?",
                    RegexOptions.IgnoreCase | RegexOptions.Singleline).Value;

                int beginCount = Regex.Matches(candidate, currentTagBeginReg, RegexOptions.IgnoreCase | RegexOptions.Singleline).Count;
                int endCount = Regex.Matches(candidate, currentTagEndReg, RegexOptions.IgnoreCase).Count;
                if (beginCount == endCount)
                {
                    // Balanced. An empty candidate (no match left) also lands here as 0 == 0.
                    return candidate;
                }
                i++;
            }
        }

        #endregion
    }

    /// <summary>
    /// Query helpers that continue a search from the first element of a result list.
    /// </summary>
    public static class XHtmlElementExtendsion
    {
        /// <summary>
        /// Searches inside the first element of the list; see <see cref="XHtmlElement.Descendants"/>.
        /// </summary>
        /// <param name="htmlInfoList">Result list whose first element is searched.</param>
        /// <param name="elementName">Tag name to look for (case-insensitive); null matches every element.</param>
        /// <returns>The matching elements; an empty list when the input is empty or nothing matches.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="htmlInfoList"/> is null.</exception>
        public static List<HtmlInfo> Descendants(this IEnumerable<HtmlInfo> htmlInfoList, string elementName = null)
        {
            var first = FirstElement(htmlInfoList);
            if (first == null || first.InnerHtml == null)
            {
                return new List<HtmlInfo>();
            }
            return new XHtmlElement(first.InnerHtml).Descendants(elementName);
        }

        /// <summary>
        /// Returns the direct children of the first element of the list, filtered by tag name.
        /// </summary>
        /// <param name="htmlInfoList">Result list whose first element supplies the children.</param>
        /// <param name="elementName">Tag name to keep (case-insensitive); null keeps every child.</param>
        /// <returns>The matching children; an empty list when the input is empty or nothing matches.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="htmlInfoList"/> is null.</exception>
        public static List<HtmlInfo> ChildDescendants(this IEnumerable<HtmlInfo> htmlInfoList, string elementName = null)
        {
            var first = FirstElement(htmlInfoList);
            if (first == null || first.InnerHtml == null)
            {
                return new List<HtmlInfo>();
            }
            return new XHtmlElement(first.InnerHtml)
                .RootDescendants(first.InnerHtml)
                .Where(c => elementName == null
                    || string.Equals(c.TagName, elementName, StringComparison.OrdinalIgnoreCase))
                .ToList();
        }

        /// <summary>
        /// Locates the parent of the first element of the list inside <paramref name="fullHtml"/>.
        /// </summary>
        /// <param name="htmlInfoList">Result list whose first element's level is looked up.</param>
        /// <param name="fullHtml">The complete document the element was taken from.</param>
        /// <returns>The parsed parent element(s); an empty list when the input is empty or no parent is found.</returns>
        /// <exception cref="ArgumentNullException">An argument is null.</exception>
        public static List<HtmlInfo> ParentDescendant(this IEnumerable<HtmlInfo> htmlInfoList, string fullHtml)
        {
            if (fullHtml == null)
            {
                throw new ArgumentNullException("fullHtml");
            }

            var first = FirstElement(htmlInfoList);
            if (first == null || string.IsNullOrEmpty(first.SameLeveHtml))
            {
                return new List<HtmlInfo>();
            }

            // Swap the element's whole level for a unique marker, match the tag pair that wraps
            // the marker, then put the original level back.
            string sameLevelHtml = first.SameLeveHtml;
            string marker = Guid.NewGuid().ToString();
            string markedHtml = fullHtml.Replace(sameLevelHtml, marker);
            string parentHtml = Regex.Match(markedHtml, @"<[^<]+?>[^<]*?" + marker + @".*?<\/.+?>").Value;
            parentHtml = parentHtml.Replace(marker, sameLevelHtml);

            return new XHtmlElement(parentHtml).RootDescendants();
        }

        /// <summary>First element of the list, or null when the list is empty.</summary>
        private static HtmlInfo FirstElement(IEnumerable<HtmlInfo> htmlInfoList)
        {
            if (htmlInfoList == null)
            {
                throw new ArgumentNullException("htmlInfoList");
            }
            return htmlInfoList.FirstOrDefault();
        }
    }

    /// <summary>
    /// One parsed HTML element.
    /// </summary>
    public class HtmlInfo
    {
        /// <summary>
        /// Tag name of the element.
        /// </summary>
        public string TagName { get; set; }

        /// <summary>
        /// Attributes of the opening tag, keyed by attribute name.
        /// </summary>
        public Dictionary<string, string> Attributes { get; set; }

        /// <summary>
        /// Markup between the opening and closing tag.
        /// </summary>
        public string InnerHtml { get; set; }

        /// <summary>
        /// The element exactly as it appeared in the source.
        /// </summary>
        public string OldFullHtml { get; set; }

        /// <summary>
        /// The fragment this element was split from, i.e. the element together with its siblings.
        /// </summary>
        public string SameLeveHtml { get; set; }

        /// <summary>
        /// The element rebuilt from <see cref="TagName"/>, <see cref="Attributes"/> and <see cref="InnerHtml"/>.
        /// </summary>
        public string FullHtml
        {
            get
            {
                string attributesString = string.Empty;
                if (Attributes != null && Attributes.Count > 0)
                {
                    // Leading space only when there is something to separate, so <p> stays <p>.
                    attributesString = " " + string.Join(" ", Attributes.Select(c => string.Format("{0}=\"{1}\"", c.Key, c.Value)));
                }
                return string.Format("<{0}{1}>{2}</{0}>", TagName, attributesString, InnerHtml);
            }
        }
    }
}
前台HTML:
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
    <title></title>
</head>
<body>
    <a id="1">我是1</a>
    <a id="2" class="icon">icon</a>
    <img />
</body>
</html>