公司编辑妹子需要爬取网页内容,叫我帮忙做了一个简单的网页内容爬取工具
爬取网页内容对大家来说应该都不难,但是这里有一些小改动,代码献上,供大家参考
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
|
/// <summary>
/// Downloads the HTML source of the given URL.
/// First attempt decodes the body as UTF-8; if anything fails (bad encoding,
/// anti-crawler error status, network hiccup) it retries once decoding as
/// GB2312, which is common on older Chinese sites.
/// </summary>
/// <param name="url">Absolute URL of the page to fetch.</param>
/// <returns>The page source as a string; empty string is never returned unless the body is empty.</returns>
private string GetHttpWebRequest(string url)
{
    try
    {
        return FetchHtml(url, Encoding.GetEncoding("utf-8"));
    }
    catch
    {
        // Fallback path: same request, but decode as GB2312.
        return FetchHtml(url, Encoding.GetEncoding("gb2312"));
    }
}

/// <summary>
/// Performs one HTTP GET and decodes the response body with the given encoding.
/// All request properties are set BEFORE the request is sent (the original code
/// called GetResponse() first, leaked that response, and only then set headers).
/// </summary>
private static string FetchHtml(string url, Encoding encoding)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(new Uri(url));
    // UserAgent must not contain the "user-agent:" header-name prefix —
    // the property already names the header.
    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705)";
    request.Accept = "*/*";
    request.KeepAlive = true;
    request.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");

    HttpWebResponse response;
    try
    {
        response = (HttpWebResponse)request.GetResponse();
    }
    catch (WebException ex)
    {
        // Some anti-crawler sites answer with an error status but still send a
        // usable HTML body; read it from the attached response if present.
        response = (HttpWebResponse)ex.Response;
        if (response == null)
        {
            throw; // No body at all — let the caller's fallback handle it.
        }
    }

    // using-blocks guarantee the response, stream and reader are disposed
    // even when ReadToEnd throws (the original leaked them on failure).
    using (response)
    using (Stream receiveStream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(receiveStream, encoding))
    {
        return reader.ReadToEnd();
    }
}
这是根据 URL 爬取网页源码的方法,有一些小改动:很多网页有不同的编码格式,甚至有些网站做了反爬取的防范,这个方法经过相应的改动后也能爬取
以下是爬取网页所有的网址链接
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
|
/// <summary>
/// Extracts all hyperlink URLs from an HTML document.
/// Root-relative hrefs (href="/x", href='/x', href=/x, href="./x") are first
/// rewritten against the site root derived from <paramref name="url"/>; anchors
/// whose href is still relative are resolved against the page's directory.
/// </summary>
/// <param name="htmlCode">Raw HTML source of the page.</param>
/// <param name="url">URL the HTML was fetched from; used to absolutize relative links.</param>
/// <returns>Distinct absolute URLs, in order of first appearance.</returns>
private static List<string> GetHyperLinks(string htmlCode, string url)
{
    List<string> result = new List<string>();
    // HashSet gives O(1) duplicate checks (the original used List.Contains, O(n),
    // plus a second list that was always empty and several dead accumulators).
    HashSet<string> seen = new HashSet<string>();

    // Scheme + host of the current page, e.g. "http://example.com/".
    Regex siteRootRegex = new Regex(@"http(s)?://([\w-]+\.)+[\w-]+/?");
    string siteRoot = siteRootRegex.Match(url, 0).Value;

    // Absolutize root-relative href variants before matching anchor tags.
    string content = htmlCode
        .Replace("href=\"/", "href=\"" + siteRoot)
        .Replace("href='/", "href='" + siteRoot)
        .Replace("href=/", "href=" + siteRoot)
        .Replace("href=\"./", "href=\"" + siteRoot);

    MatchCollection anchors = Regex.Matches(content, @"<[aA][^>]* href=[^>]*>", RegexOptions.Singleline);

    foreach (Match anchor in anchors)
    {
        MatchCollection absoluteUrls = Regex.Matches(anchor.Value, @"[a-zA-Z]+://[^\s]*", RegexOptions.Singleline);
        if (absoluteUrls.Count > 0)
        {
            CollectLinks(absoluteUrls, result, seen);
        }
        else if (anchor.Value.IndexOf("javascript") == -1)
        {
            // Still relative: prefix with the directory of the current page
            // (everything up to and including the last '/').
            string baseDir = url.Substring(0, url.LastIndexOf("/") + 1);
            string rewritten = anchor.Value
                .Replace("href=\"", "href=\"" + baseDir)
                .Replace("href='", "href='" + baseDir);
            CollectLinks(
                Regex.Matches(rewritten, @"[a-zA-Z]+://[^\s]*", RegexOptions.Singleline),
                result,
                seen);
        }
    }
    return result;
}

/// <summary>
/// Strips quote/terminator characters that the loose URL regex drags in
/// (", ', &gt;, ;) and appends each not-yet-seen URL to the result list.
/// </summary>
private static void CollectLinks(MatchCollection matches, List<string> result, HashSet<string> seen)
{
    foreach (Match m in matches)
    {
        string link = m.Value.Replace("\"", "").Replace("'", "").Replace(">", "").Replace(";", "");
        if (seen.Add(link))
        {
            result.Add(link);
        }
    }
}
这块的技术其实就是简单的使用了正则去匹配!接下来献上获取标题,以及存储到xml文件的方法
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
|
/// <summary>
/// 把网址写入xml文件 — writes the collected hyperlinks to D:\hyperlinks.xml.
/// Each link becomes one element whose name is its domain suffix (see GetDomain).
/// </summary>
/// <param name="strUrl">The page the links were extracted from (recorded in a comment).</param>
/// <param name="alHyperLinks">The links to persist.</param>
private static void WriteToXml(string strUrl, List<string> alHyperLinks)
{
    // using guarantees the file handle is released even if a write throws
    // (the original only closed the writer on the success path).
    using (XmlTextWriter writer = new XmlTextWriter(@"D:\hyperlinks.xml", Encoding.UTF8))
    {
        writer.Formatting = Formatting.Indented;
        writer.WriteStartDocument(false);
        writer.WriteDocType("HyperLinks", null, "urls.dtd", null);
        writer.WriteComment("提取自" + strUrl + "的超链接");
        writer.WriteStartElement("HyperLinks");
        writer.WriteStartElement("HyperLinks", null);
        writer.WriteAttributeString("DateTime", DateTime.Now.ToString());
        foreach (string str in alHyperLinks)
        {
            // Element name = domain suffix ("com", "net", ... or "other");
            // element text = the full URL.
            string title = GetDomain(str);
            string body = str;
            writer.WriteElementString(title, null, body);
        }
        writer.WriteEndElement();
        writer.WriteEndElement();
        writer.Flush();
    }
}

/// <summary>
/// 获取网址的域名后缀 — returns the domain suffix of a URL
/// ("com", "net", "cn", "org" or "gov"), or "other" when none of these matches.
/// </summary>
/// <param name="strUrl">The URL to classify.</param>
private static string GetDomain(string strUrl)
{
    string retVal;
    // Match one of the known suffixes including the surrounding "." and "/".
    string strRegex = @"(\.com/|\.net/|\.cn/|\.org/|\.gov/)";
    Regex r = new Regex(strRegex, RegexOptions.IgnoreCase);
    Match m = r.Match(strUrl);
    retVal = m.ToString();
    // Strip the leading dot and the trailing slash, e.g. ".com/" -> "com".
    strRegex = @"\.|/$";
    retVal = Regex.Replace(retVal, strRegex, "").ToString();
    if (retVal == "")
        retVal = "other";
    return retVal;
}

/// <summary>
/// 获取标题 — extracts a page title from HTML.
/// Starts from the &lt;title&gt; text; if an &lt;h1&gt; exists and the title
/// begins with its text, the shorter h1 is preferred (正文的标题一般在h1中,
/// 比title中的标题更干净 — h1 is usually cleaner than title).
/// </summary>
/// <param name="html">Raw HTML source.</param>
/// <returns>The extracted title, or "" when no &lt;title&gt; tag matches.</returns>
private static string GetTitle(string html)
{
    string titleFilter = @"<title>[\s\S]*?</title>";
    string h1Filter = @"<h1.*?>.*?</h1>";
    string clearFilter = @"<.*?>"; // strips any remaining tags from the match

    string title = "";
    Match match = Regex.Match(html, titleFilter, RegexOptions.IgnoreCase);
    if (match.Success)
    {
        title = Regex.Replace(match.Groups[0].Value, clearFilter, "");
    }

    // 正文的标题一般在h1中,比title中的标题更干净
    match = Regex.Match(html, h1Filter, RegexOptions.IgnoreCase);
    if (match.Success)
    {
        string h1 = Regex.Replace(match.Groups[0].Value, clearFilter, "");
        if (!string.IsNullOrEmpty(h1) && title.StartsWith(h1))
        {
            title = h1;
        }
    }
    return title;
}
这就是所用的全部方法,还有很多需要改进之处!大家如果发现不足之处还请指出,谢谢!
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持服务器之家。