废话不多说,直接上代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
|
/* * To change this license header, choose License Headers in Project Properties. * To change this template file, choose Tools | Templates * and open the template in the editor. */ /* * File: main.cpp * Author: yangchao * */ #include <iostream> #include <string> #include <netdb.h> #include <string.h> #include <stdlib.h> using namespace std; void parseHostAndPagePath( const string url,string &hostUrl,string &pagePath){ hostUrl=url; pagePath= "/" ; int pos=hostUrl.find( "http://" ); if (-1!=pos) hostUrl=hostUrl.replace(pos,7, "" ); pos=hostUrl.find( "https://" ); if (-1!=pos) hostUrl=hostUrl.replace(pos,8, "" ); pos=hostUrl.find( "/" ); if (-1!=pos) { pagePath=hostUrl.substr(pos); hostUrl=hostUrl.substr(0,pos); } } string getPageContent( const string url){ struct hostent *host; string hostUrl,pagePath; parseHostAndPagePath(url,hostUrl,pagePath); if (0==(host=gethostbyname(hostUrl.c_str()))) { cout<< "gethostbyname error\n" <<endl; exit (1); } struct sockaddr_in pin; int port=80; bzero(&pin, sizeof (pin)); pin.sin_family=AF_INET; pin.sin_port=htons(port); pin.sin_addr.s_addr=(( struct in_addr*)(host->h_addr))->s_addr; int isock; if ((isock=socket(AF_INET,SOCK_STREAM,0))==-1) { cout<< "open socket error\n" <<endl; exit (1); } string requestHeader; requestHeader= "GET " +pagePath+ " HTTP/1.1\r\n" ; requestHeader+= "Host: " +hostUrl+ "\r\n" ; requestHeader+= "Accept: */*\r\n" ; requestHeader+= "User-Agent: Mozilla/4.0(compatible)\r\n" ; requestHeader+= "connection:Keep-Alive\r\n" ; requestHeader+= "\r\n" ; if (connect(isock,( const sockaddr*)&pin, sizeof (pin))==-1){ cout<< "connect error\n" <<endl; exit (1); } if (send(isock,requestHeader.c_str(),requestHeader.size(),0)==-1){ cout<< "send error\n" <<endl; exit (1); } struct timeval timeout={1,0}; setsockopt(isock,SOL_SOCKET,SO_RCVTIMEO,( char *)&timeout, sizeof ( struct timeval)); char c; bool flag= true ; while (recv(isock,&c,1,0)>0){ if ( '\r' ==c){ continue ; } else if ( '\n' ==c){ if ( false ==flag) break ; flag= false ; } else { flag= true ; } } int len,BUFFER_SIZE=512; char buffer[BUFFER_SIZE]; string pageContent= "" ; while ((len=recv(isock,buffer,BUFFER_SIZE-1,0))>0){ buffer[len]= '\0' ; pageContent+=buffer; } return pageContent; } int main( int argc, char ** argv) { cout<<getPageContent( "http://www.hao123.com" )<<endl; return 0; } |
以上这篇linux c++模拟简易网络爬虫实例就是小编分享给大家的全部内容了,希望能给大家一个参考,也希望大家多多支持服务器之家。