Last time I threw together a crawler for a colleague at the company, and it wasn't very polished. This time a company project actually needs it, so I reworked it: it now collects page URLs and image URLs, downloads the images, and uses threads to keep the UI responsive while the URL crawling and image downloads run.
The idea: first fetch the full content of the seed URL, collect the image URLs on that page, then collect the links on it, push the collected links into a queue, and keep collecting images and then links from each queued page, looping indefinitely.
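For reference, here is a minimal sketch of that crawl loop. It is not the exact production code: the Crawl method name and the visited set are my own additions, and HttpHelper is the wrapper class shown later in this post.

// Requires: using System.Collections.Generic;
// Minimal sketch of the queue-based crawl loop described above (illustrative, not the original code).
public static void Crawl(string seedUrl)
{
    Queue<string> pending = new Queue<string>();
    HashSet<string> visited = new HashSet<string>();
    pending.Enqueue(seedUrl);

    while (pending.Count > 0)
    {
        string current = pending.Dequeue();
        if (!visited.Add(current)) continue;        // skip pages we have already crawled

        // Collect image URLs on this page and hand them to the downloader.
        foreach (string img in HttpHelper.GetHtmlImageUrlList(current))
        {
            // e.g. queue img for the asynchronous DownloadImg call shown later
        }

        // Collect links on this page and queue them for the next round.
        foreach (string link in HttpHelper.GetHttpLinks(current))
        {
            if (!visited.Contains(link))
                pending.Enqueue(link);
        }
    }
}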
As usual, a screenshot first so you can see what it looks like.
Both the page-content fetching and the link crawling have been improved. Now let's look at the code; if anything falls short, please point it out!
Page content fetching: HtmlCodeRequest.
Link crawling: GetHttpLinks, which uses a regular expression to filter the links out of the HTML.
Image extraction: GetHtmlImageUrlList, which uses a regular expression to filter the img tags out of the HTML.
All of these are wrapped in one helper class, HttpHelper.
/// <summary>
/// Fetch the HTML source of the given URL.
/// </summary>
/// <param name="url">page URL</param>
/// <returns>HTML source, or an empty string on failure</returns>
public static string HtmlCodeRequest(string url)
{
    if (string.IsNullOrEmpty(url))
    {
        return "";
    }
    try
    {
        // Create the request
        HttpWebRequest httpRequst = (HttpWebRequest)WebRequest.Create(url);
        // Use a persistent (keep-alive) connection
        httpRequst.KeepAlive = true;
        // Request method
        httpRequst.Method = "GET";
        // Header values
        httpRequst.UserAgent = "User-Agent:Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705";
        httpRequst.Accept = "*/*";
        httpRequst.Headers.Add("Accept-Language", "zh-CN,en-US;q=0.5");
        httpRequst.ServicePoint.Expect100Continue = false;
        httpRequst.Timeout = 5000;
        httpRequst.AllowAutoRedirect = true; // follow 302 redirects
        ServicePointManager.DefaultConnectionLimit = 30;
        // Get the response
        HttpWebResponse webRes = (HttpWebResponse)httpRequst.GetResponse();
        // Read the response stream as text
        string content = string.Empty;
        using (System.IO.Stream stream = webRes.GetResponseStream())
        {
            using (System.IO.StreamReader reader = new StreamReader(stream, System.Text.Encoding.GetEncoding("utf-8")))
            {
                content = reader.ReadToEnd();
            }
        }
        // Abort the request
        httpRequst.Abort();
        // Return the page content
        return content;
    }
    catch (Exception)
    {
        return "";
    }
}

/// <summary>
/// Extract all image URLs from a page.
/// </summary>
/// <param name="url">page URL</param>
/// <returns>list of image URLs</returns>
public static List<string> GetHtmlImageUrlList(string url)
{
    string html = HttpHelper.HtmlCodeRequest(url);
    if (string.IsNullOrEmpty(html))
    {
        return new List<string>();
    }
    // Regular expression matching <img> tags
    Regex regImg = new Regex(@"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);
    // Collect the matches
    MatchCollection matches = regImg.Matches(html);
    List<string> sUrlList = new List<string>();
    foreach (Match match in matches)
        sUrlList.Add(match.Groups["imgUrl"].Value);
    return sUrlList;
}

/// <summary>
/// Extract the links on a page.
/// </summary>
/// <param name="url">page URL</param>
/// <returns>list of links</returns>
public static List<string> GetHttpLinks(string url)
{
    // Fetch the page content
    string html = HttpHelper.HtmlCodeRequest(url);
    if (string.IsNullOrEmpty(html))
    {
        return new List<string>();
    }
    // Match absolute http(s) links
    const string pattern2 = @"http(s)?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?";
    Regex r2 = new Regex(pattern2, RegexOptions.IgnoreCase);
    MatchCollection m2 = r2.Matches(html);
    List<string> links = new List<string>();
    foreach (Match url2 in m2)
    {
        if (StringHelper.CheckUrlIsLegal(url2.ToString()) || !StringHelper.IsPureUrl(url2.ToString()) || links.Contains(url2.ToString()))
            continue;
        links.Add(url2.ToString());
    }
    // Match links inside href attributes
    const string pattern = @"(?i)<a\s[^>]*?href=(['""]?)(?!javascript|__doPostBack)(?<url>[^'""\s*#<>]+)[^>]*>";
    Regex r = new Regex(pattern, RegexOptions.IgnoreCase);
    MatchCollection m = r.Matches(html);
    foreach (Match url1 in m)
    {
        string href1 = url1.Groups["url"].Value;
        if (!href1.Contains("http"))
        {
            href1 = Global.WebUrl + href1;
        }
        if (!StringHelper.IsPureUrl(href1) || links.Contains(href1))
            continue;
        links.Add(href1);
    }
    return links;
}
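To make the img regex less opaque, here is a tiny standalone demo of it against a hard-coded HTML fragment. The fragment and the RegexDemo class are made up for illustration only; the pattern is the same one used in GetHtmlImageUrlList.

using System;
using System.Text.RegularExpressions;

class RegexDemo
{
    static void Main()
    {
        // Made-up HTML fragment, just to show what the <img> regex captures.
        string html = "<p><img src=\"/upload/a.jpg\" alt=\"a\"/> <img src='http://cdn.example.com/b.png'></p>";
        Regex regImg = new Regex(@"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);
        foreach (Match m in regImg.Matches(html))
        {
            // Prints: /upload/a.jpg  then  http://cdn.example.com/b.png
            Console.WriteLine(m.Groups["imgUrl"].Value);
        }
    }
}

Note that relative src values (like /upload/a.jpg above) come back relative; DownloadImg below prepends Global.WebUrl before requesting them.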
Image downloading has a cap on the number of in-flight tasks: 200. If the cap is exceeded, the thread waits 5 seconds before queuing more. The download itself is invoked asynchronously through a delegate; a sketch of that throttling follows the DownloadImg method below.
public string DownloadImg(string url)
{
    if (!string.IsNullOrEmpty(url))
    {
        try
        {
            if (!url.Contains("http"))
            {
                url = Global.WebUrl + url;
            }
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Timeout = 2000;
            request.UserAgent = "User-Agent:Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705";
            // Follow 302 redirects
            request.AllowAutoRedirect = true;
            WebResponse response = request.GetResponse();
            Stream reader = response.GetResponseStream();
            // File name
            string aFirstName = Guid.NewGuid().ToString();
            // Extension
            string aLastName = url.Substring(url.LastIndexOf(".") + 1, (url.Length - url.LastIndexOf(".") - 1));
            FileStream writer = new FileStream(Global.FloderUrl + aFirstName + "." + aLastName, FileMode.OpenOrCreate, FileAccess.Write);
            byte[] buff = new byte[512];
            // Number of bytes actually read
            int c = 0;
            while ((c = reader.Read(buff, 0, buff.Length)) > 0)
            {
                writer.Write(buff, 0, c);
            }
            writer.Close();
            writer.Dispose();
            reader.Close();
            reader.Dispose();
            response.Close();
            return (aFirstName + "." + aLastName);
        }
        catch (Exception)
        {
            return "Error: URL " + url;
        }
    }
    return "Error: empty URL";
}
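The throttling and the asynchronous delegate call described earlier are not shown above, so here is a minimal sketch of how they might look. The field and method names (_runningTasks, QueueImageDownload) are assumptions for illustration, not the original project's code; it relies on the classic .NET Framework delegate BeginInvoke/EndInvoke pattern.

// Requires: using System; using System.Threading;
// Illustrative throttle around DownloadImg (names here are assumptions, not the original code).
private int _runningTasks = 0;

public void QueueImageDownload(string imgUrl)
{
    // If 200 or more downloads are already in flight, wait 5 seconds before queuing more.
    while (_runningTasks >= 200)
    {
        Thread.Sleep(5000);
    }

    Interlocked.Increment(ref _runningTasks);

    // Invoke DownloadImg asynchronously through a delegate (.NET Framework async delegate pattern).
    Func<string, string> worker = DownloadImg;
    worker.BeginInvoke(imgUrl, ar =>
    {
        string fileName = worker.EndInvoke(ar);   // the saved file name, or an error message
        Interlocked.Decrement(ref _runningTasks);
        // update the UI / log with fileName here
    }, null);
}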
That's all for now; further improvements are up to you! Readers are welcome to get in touch and discuss.