呵呵,由于正则不熟,所以另谋出路——利用XML去解析html。
要想将抓取到的数据(直接抓取到的是byte[]) 转为XML文档(即XMLDocument对象),有两个要点:
一、判断编码(http头 charset 在某些网站上是不准确的)
我利用的是 UniversalDetector(见下面代码中的 GetString 方法)去判断编码的,效果还不错。
二、将html转为XHTML
我利用的是 : SgmlReaderDll.dll ,微软提供的,虽然不是100%的准确,但是足以满足 轻量级的商业需求 。
核心代码如下:
/// <summary>
/// Converts raw HTML bytes into an <see cref="XmlDocument"/>: detects the text
/// encoding, runs the markup through <see cref="HtmlReader"/>/<see cref="HtmlWriter"/>
/// and loads the result as XML.
/// </summary>
public class XHtmlTools
{
    // NOTE(review): this pattern appears to have been damaged when the article was
    // published (angle-bracketed fragments look stripped). As written, "(?[" does not
    // look like a valid .NET grouping construct - restore the pattern from the
    // original source before relying on Convert().
    private const string RegBody = @" (?[\s\S]*)";

    /// <summary>
    /// Builds an XML document from raw HTML bytes.
    /// </summary>
    /// <param name="html">Raw bytes of the HTML page.</param>
    /// <returns>
    /// The loaded document, or <c>null</c> when <paramref name="html"/> is null,
    /// the conversion yields an empty string, or the XML cannot be parsed.
    /// </returns>
    public XmlDocument GetXmlDocument(byte[] html)
    {
        if (html == null)
        {
            return null;
        }

        string xml = Convert(html);
        if (string.IsNullOrEmpty(xml))
        {
            return null;
        }

        // NOTE(review): every fragment below except the closing "]>" is a single
        // space. They presumably held a DOCTYPE internal subset (entity
        // declarations) that was stripped on publication - TODO restore from the
        // original source. Kept byte-for-byte so this edit does not invent content.
        StringBuilder XMLHEAD = new StringBuilder();
        XMLHEAD.Append(" ");
        XMLHEAD.Append(" ");
        XMLHEAD.Append(" ");
        XMLHEAD.Append(" ");
        XMLHEAD.Append(" ");
        XMLHEAD.Append(" ");
        XMLHEAD.Append(" ");
        XMLHEAD.Append(" ");
        XMLHEAD.Append(" ");
        XMLHEAD.Append(" ");
        XMLHEAD.Append(" ");
        XMLHEAD.Append(" ");
        XMLHEAD.Append(" ");
        XMLHEAD.Append(" ");
        XMLHEAD.Append(" ");
        XMLHEAD.Append(" ");
        XMLHEAD.Append(" ");
        XMLHEAD.Append(" ");
        XMLHEAD.Append(" ");
        XMLHEAD.Append("]>");

        try
        {
            XmlDocument xmlDoc = new XmlDocument();
            // No resolver: external DTDs/entities are never fetched while loading.
            xmlDoc.XmlResolver = null;
            xmlDoc.LoadXml(XMLHEAD.ToString() + xml);
            return xmlDoc;
        }
        catch (XmlException)
        {
            return null;
        }
    }

    /// <summary>
    /// Converts HTML bytes to an XML string.
    /// </summary>
    /// <param name="html">Raw bytes of the HTML page.</param>
    /// <returns>
    /// The portion of the converted markup matched by <c>RegBody</c> when the
    /// pattern matches, otherwise the whole converted markup (possibly empty).
    /// </returns>
    public string Convert(byte[] html)
    {
        string xml = string.Empty;
        try
        {
            using (HtmlReader reader = new HtmlReader(GetString(html)))
            {
                StringBuilder sb = new StringBuilder();
                using (HtmlWriter writer = new HtmlWriter(sb))
                {
                    while (!reader.EOF)
                    {
                        writer.WriteNode(reader, true);
                    }
                }
                xml = sb.ToString();
            }
        }
        catch (Exception)
        {
            // Deliberate best effort: a failed conversion leaves xml empty, which
            // GetXmlDocument reports to its caller as null.
        }

        Match match = Regex.Match(xml, RegBody, RegexOptions.IgnoreCase);
        if (match.Success)
        {
            xml = match.Value;
        }

        if (string.IsNullOrEmpty(xml))
        {
            // NOTE(review): the fallback literal is empty as published; it
            // presumably held an empty root element that was stripped - confirm.
            xml = "";
        }
        return xml;
    }

    /// <summary>
    /// Detects the character set of <paramref name="buffer"/> and decodes it.
    /// </summary>
    /// <param name="buffer">Raw bytes to decode.</param>
    /// <returns>
    /// The decoded text, or an empty string when the buffer is null/empty, no
    /// charset is detected, or the detected charset name is not accepted by
    /// <see cref="System.Text.Encoding.GetEncoding(string)"/>.
    /// </returns>
    public string GetString(byte[] buffer)
    {
        string result = string.Empty;
        if (buffer == null)
        {
            return result;
        }

        using (MemoryStream msTemp = new MemoryStream(buffer))
        {
            if (msTemp.Length > 0)
            {
                msTemp.Seek(0, SeekOrigin.Begin);
                int DetLen = 0;
                byte[] DetectBuff = new byte[4096];
                UniversalDetector det = new UniversalDetector(null);
                while ((DetLen = msTemp.Read(DetectBuff, 0, DetectBuff.Length)) > 0 && !det.IsDone())
                {
                    // Feed only the bytes actually read. The tail of DetectBuff
                    // holds zeros or data from the previous chunk and must not be
                    // shown to the detector.
                    det.HandleData(DetectBuff, 0, DetLen);
                }
                det.DataEnd();

                string charset = det.GetDetectedCharset();
                if (charset != null)
                {
                    try
                    {
                        result = System.Text.Encoding.GetEncoding(charset).GetString(buffer);
                    }
                    catch (ArgumentException)
                    {
                        // Charset name not supported on this platform: keep the empty result.
                    }
                }
            }
        }
        return result;
    }
}

/// <summary>
/// An <see cref="Sgml.SgmlReader"/> preconfigured with DocType "HTML" that skips
/// <c>head</c> and <c>script</c> elements while reading.
/// </summary>
public class HtmlReader : Sgml.SgmlReader
{
    /// <summary>Reads HTML from <paramref name="reader"/>.</summary>
    public HtmlReader(TextReader reader) : base()
    {
        base.InputStream = reader;
        base.DocType = "HTML";
    }

    /// <summary>Reads HTML from <paramref name="content"/> after HTML-decoding it.</summary>
    public HtmlReader(string content) : base()
    {
        base.InputStream = new StringReader(System.Web.HttpUtility.HtmlDecode(content));
        base.DocType = "HTML";
    }

    /// <summary>
    /// Advances to the next node; when that node is a <c>head</c> or <c>script</c>
    /// element it is skipped. Exceptions are written to the console instead of
    /// being propagated.
    /// </summary>
    /// <returns>The result of the underlying read; <c>false</c> if that read threw.</returns>
    public override bool Read()
    {
        bool status = false;
        try
        {
            status = base.Read();
            if (status && base.NodeType == XmlNodeType.Element)
            {
                // Ordinal comparison: tag names are identifiers, and a
                // culture-aware ignore-case compare fails for "SCRIPT" under
                // cultures such as tr-TR.
                string name = base.Name;
                if (string.Equals(name, "head", StringComparison.OrdinalIgnoreCase)
                    || string.Equals(name, "script", StringComparison.OrdinalIgnoreCase))
                {
                    base.Skip();
                }
            }
        }
        catch (Exception ex)
        {
            Console.WriteLine(ex.Message);
        }
        return status;
    }
}

/// <summary>
/// An <see cref="XmlTextWriter"/> that drops CDATA sections and comments, filters
/// whitespace, and normalises element/attribute names (prefix removed, filtered
/// characters removed, lower-cased).
/// </summary>
public class HtmlWriter : XmlTextWriter
{
    // Characters removed from element and attribute names.
    // NOTE(review): the last two entries are both a plain space as published; one
    // of them was presumably a different whitespace character - confirm.
    private readonly char[] chArrFilter = new char[] { '\'', '=', '?', '\"', '.', ';', ':', ')', '(', ' ', ' ' };

    public HtmlWriter(TextWriter writer) : base(writer) { }

    public HtmlWriter(StringBuilder builder) : base(new StringWriter(builder)) { }

    public HtmlWriter(Stream stream, Encoding enc) : base(stream, enc) { }

    /// <summary>CDATA sections are intentionally not written.</summary>
    public override void WriteCData(string text)
    {
    }

    /// <summary>Comments are intentionally not written.</summary>
    public override void WriteComment(string text)
    {
    }

    /// <summary>
    /// Writes whitespace unless it contains CR-LF or a tab, or is exactly one space.
    /// </summary>
    public override void WriteWhitespace(string ws)
    {
        if (ws.IndexOf("\r\n", StringComparison.Ordinal) > -1 || ws.IndexOf('\t') > -1)
        {
            return;
        }
        if (ws != " ")
        {
            base.WriteWhitespace(ws);
        }
    }

    /// <summary>
    /// Writes a start tag using only the local name: any prefix up to the last
    /// ':' is removed, filtered characters are removed, and the name is
    /// lower-cased. Prefix and namespace are discarded. Nothing is written when
    /// <paramref name="localName"/> is empty.
    /// </summary>
    public override void WriteStartElement(string prefix, string localName, string ns)
    {
        if (localName == "")
        {
            return;
        }

        int index = localName.LastIndexOf(':');
        if (index > -1)
        {
            // Guard against names that still carry a prefix.
            localName = localName.Substring(index + 1);
        }
        // Invariant lower-casing keeps tag names stable regardless of the current culture.
        localName = string.Join("", localName.Split(chArrFilter)).ToLowerInvariant();
        base.WriteStartElement("", localName, "");
    }

    /// <summary>
    /// Copies attributes from <paramref name="reader"/>. Attributes whose
    /// lower-cased name contains ':' or starts with "xml" are skipped (except
    /// "xml:space"), as are attributes whose name is empty after filtering.
    /// Empty value chunks are not written.
    /// </summary>
    public override void WriteAttributes(XmlReader reader, bool defattr)
    {
        if (reader.NodeType == XmlNodeType.Element || reader.NodeType == XmlNodeType.XmlDeclaration)
        {
            if (reader.MoveToFirstAttribute())
            {
                // Re-enter on the first attribute, then restore the reader position.
                this.WriteAttributes(reader, defattr);
                reader.MoveToElement();
            }
        }
        else if (reader.NodeType == XmlNodeType.Attribute)
        {
            do
            {
                string localName = reader.LocalName.ToLowerInvariant();
                if (localName != "xml:space"
                    && (localName.LastIndexOf(':') > -1 || localName.StartsWith("xml", StringComparison.Ordinal)))
                {
                    // Skip prefixed and xml* attributes.
                    continue;
                }

                localName = string.Join("", localName.Split(chArrFilter));
                if (localName == "")
                {
                    continue;
                }

                this.WriteStartAttribute("", localName, "");
                while (reader.ReadAttributeValue())
                {
                    string value = reader.Value;
                    if (value == "")
                    {
                        continue;
                    }
                    this.WriteString(value);
                }
                this.WriteEndAttribute();
            } while (reader.MoveToNextAttribute());
        }
    }
}
上述源码及DLL :
下面再说一下解析XML,我利用的XPath:
XPath 和 jQuery所支持的选择器有一定的相似之处,借助jQuery所支持的选择器去理解XPath会更容易一些。
续:
由网友分享的 HtmlAgilityPack,开源的力量很强大!
HtmlAgilityPack 里的部分类 的元属性截图
支持多个 .NET 版本
HtmlAgilityPack地址: