本文实例讲述了C#使用正则表达式抓取网站信息的方法。分享给大家供大家参考,具体如下:
这里以抓取京东商城商品详情为例。
1、创建JdRobber.cs程序类
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
|
/// <summary>
/// Scrapes the title, main picture and price of a JD.com product page and
/// prints them to the console.
/// </summary>
public class JdRobber
{
    // Dots are escaped so only the literal host "item.jd.com" and the literal
    // ".html" suffix are accepted (an unescaped '.' matches any character).
    private static readonly Regex ItemUrlRegex = new Regex(@"^http://item\.jd\.com/\d+\.html$");

    // Captures the numeric product id from an item URL into the "Object" group.
    private const string ItemIdPattern = @"http://item\.jd\.com/(?<Object>\d+)\.html";

    // Lazy [\s\S]*? stops at the first <h1> after the div with id="name";
    // a greedy quantifier would run to the last <h1> in the document.
    private const string TitlePattern = @"<div.*id=\""name\"".*>[\s\S]*?<h1>(?<Object>.*?)</h1>";

    // Captures the src attribute of an <img .../> tag into the "Object" group.
    private const string PicturePattern = @"<img.*src=\""(?<Object>.*?)\"".*/>";

    // Captures the value of the "p" field from the price service response.
    private const string PricePattern = @"\""p\"":\""(?<Object>\d+(\.\d{1,2})?)\""";

    /// <summary>
    /// Checks whether the given URL is a JD.com product page link of the form
    /// <c>http://item.jd.com/&lt;digits&gt;.html</c>.
    /// </summary>
    /// <param name="url">The URL to check; may be null or empty.</param>
    /// <returns><c>true</c> if the URL matches the product page format; otherwise <c>false</c>.</returns>
    public bool ValidationUrl(string url)
    {
        return !String.IsNullOrEmpty(url) && ItemUrlRegex.IsMatch(url);
    }

    /// <summary>
    /// Downloads the product page at <paramref name="url"/>, extracts the title,
    /// picture and price, and writes them to the console. Does nothing when the
    /// URL is not a valid product link or the page could not be downloaded.
    /// </summary>
    /// <param name="url">The JD.com product page URL.</param>
    public void GetInfo(string url)
    {
        if (!ValidationUrl(url))
        {
            return;
        }

        string htmlStr = WebHandler.GetHtmlStr(url, "Default");
        if (String.IsNullOrEmpty(htmlStr))
        {
            return;
        }

        string sourceWebID = WebHandler.GetRegexText(url, ItemIdPattern);
        string title = WebHandler.GetRegexText(htmlStr, TitlePattern);
        string picName = ExtractPicture(htmlStr);
        decimal price = FetchPrice(sourceWebID);

        Console.WriteLine("商品名称:{0}", title);
        Console.WriteLine("图片:{0}", picName);
        Console.WriteLine("价格:{0}", price);
    }

    // Returns the image URL found inside the <div id="spec-n1"> element, or "" if absent.
    private static string ExtractPicture(string htmlStr)
    {
        int begin = htmlStr.IndexOf("<div id=\"spec-n1\"");
        if (begin < 0)
        {
            // No picture container: skip the search for a closing tag entirely.
            return "";
        }

        int end = htmlStr.IndexOf("</div>", begin + 1);
        if (end < 0)
        {
            return "";
        }

        string subPicHtml = htmlStr.Substring(begin, end - begin);
        return WebHandler.GetRegexText(subPicHtml, PicturePattern);
    }

    // Queries the price service for the given product id; returns 0 when the id is empty
    // or no valid price could be extracted from the response.
    private static decimal FetchPrice(string sourceWebID)
    {
        if (sourceWebID == "")
        {
            return 0;
        }

        string priceUrl = @"http://p.3.cn/prices/get?skuid=J_" + sourceWebID + "&type=1";
        string priceJson = WebHandler.GetHtmlStr(priceUrl, "Default");
        return WebHandler.GetValidPrice(WebHandler.GetRegexText(priceJson, PricePattern));
    }
}
2、创建WebHandler.cs公共方法类
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
|
/// <summary>
/// Shared helper methods for downloading pages and extracting values from text
/// with regular expressions.
/// </summary>
public class WebHandler
{
    // Accepts a non-negative number with an optional fraction of one or two digits.
    private static readonly Regex PriceRegex = new Regex(@"^\d+(\.\d{1,2})?$");

    /// <summary>
    /// Downloads the content at <paramref name="url"/> and returns it as a string.
    /// </summary>
    /// <param name="url">The address to download; null or empty yields an empty string.</param>
    /// <param name="encoding">
    /// Encoding selector: <c>"UTF8"</c> decodes the response as UTF-8; any other
    /// value uses <see cref="Encoding.Default"/>.
    /// </param>
    /// <returns>The response body, or an empty string if the download failed.</returns>
    public static string GetHtmlStr(string url, string encoding)
    {
        if (String.IsNullOrEmpty(url))
        {
            return "";
        }

        Encoding ec = encoding == "UTF8" ? Encoding.UTF8 : Encoding.Default;

        try
        {
            WebRequest request = WebRequest.Create(url);

            // using blocks guarantee the response, stream and reader are released
            // even when reading fails part-way through.
            using (WebResponse response = request.GetResponse())
            using (Stream dataStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(dataStream, ec))
            {
                return reader.ReadToEnd();
            }
        }
        catch (Exception ex)
        {
            // Best-effort contract: callers treat "" as "no content", so the failure
            // is recorded for diagnostics rather than propagated.
            System.Diagnostics.Trace.TraceWarning("GetHtmlStr failed for {0}: {1}", url, ex.Message);
            return "";
        }
    }

    /// <summary>
    /// Returns the text captured by the named group <c>Object</c> of the first
    /// case-insensitive match of <paramref name="pattern"/> in <paramref name="input"/>.
    /// </summary>
    /// <param name="input">The text to search.</param>
    /// <param name="pattern">A regular expression containing a group named <c>Object</c>.</param>
    /// <returns>The captured text, or an empty string when either argument is null/empty or nothing matches.</returns>
    public static string GetRegexText(string input, string pattern)
    {
        if (String.IsNullOrEmpty(input) || String.IsNullOrEmpty(pattern))
        {
            return "";
        }

        Match match = Regex.Match(input, pattern, RegexOptions.IgnoreCase);
        return match.Success ? match.Groups["Object"].Value : "";
    }

    /// <summary>
    /// Converts a price string such as <c>"12.50"</c> to a decimal.
    /// </summary>
    /// <param name="strPrice">Digits with an optional '.' followed by one or two digits.</param>
    /// <returns>The parsed price, or 0 when the text is null, empty, or not in the expected format.</returns>
    public static decimal GetValidPrice(string strPrice)
    {
        if (String.IsNullOrEmpty(strPrice) || !PriceRegex.IsMatch(strPrice))
        {
            return 0;
        }

        // The accepted format always uses '.' as the decimal separator, so parse with the
        // invariant culture; the current culture may treat '.' as a group separator.
        decimal price;
        bool parsed = decimal.TryParse(
            strPrice,
            System.Globalization.NumberStyles.Number,
            System.Globalization.CultureInfo.InvariantCulture,
            out price);
        return parsed ? price : 0;
    }
}
希望本文所述对大家C#程序设计有所帮助。