php结合curl实现多线程抓取
1
|
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
|
<?php
/* Concurrent fetching with curl_multi */

/**
 * Fetch several URLs concurrently through a single curl_multi handle.
 *
 * @param array $array   URLs to fetch; their keys are reused as the keys of
 *                       every per-URL array in the result
 * @param int   $timeout Per-request timeout in seconds (CURLOPT_TIMEOUT)
 * @return array array(
 *                   'diff_time' => elapsed wall-clock seconds (float),
 *                   'return'    => response bodies,
 *                   'header'    => curl_getinfo() data for each request,
 *                   'error'     => curl_error() text, '' when the request succeeded
 *               )
 */
function Curl_http($array, $timeout)
{
    // Initialise every collection up front so an empty $array still yields
    // well-formed (empty) results instead of undefined variables.
    $res    = array();
    $header = array();
    $errors = array();
    $conn   = array();

    $mh = curl_multi_init();
    $startime = getmicrotime();

    // Create one easy handle per URL and attach it to the multi handle.
    foreach ($array as $k => $url) {
        $conn[$k] = curl_init($url);
        curl_setopt($conn[$k], CURLOPT_TIMEOUT, (int) $timeout); // overall request timeout
        curl_setopt($conn[$k], CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MSIE 5.01; Windows NT 5.0)');
        curl_setopt($conn[$k], CURLOPT_MAXREDIRS, 7);       // maximum number of redirects to follow
        curl_setopt($conn[$k], CURLOPT_HEADER, 0);          // leave response headers out of the body
        curl_setopt($conn[$k], CURLOPT_FOLLOWLOCATION, 1);  // follow 3xx redirects
        curl_setopt($conn[$k], CURLOPT_RETURNTRANSFER, 1);  // return the body instead of printing it
        curl_multi_add_handle($mh, $conn[$k]);
    }

    // Kick off the transfers.
    do {
        $mrc = curl_multi_exec($mh, $active);
    } while ($mrc == CURLM_CALL_MULTI_PERFORM);

    // Drive the transfers until none is active any more.
    while ($active && $mrc == CURLM_OK) {
        // Block until there is activity on a socket. curl_multi_select() may
        // return -1 (nothing to wait on yet); sleep briefly in that case so
        // the loop neither spins at 100% CPU nor stops driving the transfers.
        if (curl_multi_select($mh) == -1) {
            usleep(1000);
        }
        do {
            $mrc = curl_multi_exec($mh, $active);
        } while ($mrc == CURLM_CALL_MULTI_PERFORM);
    }

    // Collect the results, then release each handle.
    foreach ($conn as $k => $ch) {
        $errors[$k] = curl_error($ch);             // '' when no error occurred
        $res[$k]    = curl_multi_getcontent($ch);  // response body
        $header[$k] = curl_getinfo($ch);           // transfer information
        curl_multi_remove_handle($mh, $ch);        // detach from the multi handle first...
        curl_close($ch);                           // ...and only then close the easy handle
    }
    curl_multi_close($mh);

    $endtime = getmicrotime();
    $diff_time = $endtime - $startime;

    return array(
        'diff_time' => $diff_time,
        'return'    => $res,
        'header'    => $header,
        'error'     => $errors
    );
}

/**
 * Current Unix timestamp with microsecond precision.
 *
 * @return float seconds since the Unix epoch
 */
function getmicrotime()
{
    return microtime(true);
}

// Try it out: fetch three URLs concurrently
$array = array(
    "http://www.weibo.com/",
    "http://www.renren.com/",
    "http://www.qq.com/"
);
$data = Curl_http($array, 10); // run the requests with a 10-second timeout
var_dump($data);               // dump the results

// Note: when the POST body is larger than 1024 bytes, curl does not send the
// POST request right away. To avoid that, send an empty Expect header with
// the request:
//     curl_setopt($ch, CURLOPT_HTTPHEADER, array("Expect:"));
?>
我们再来看几个例子
(1)下面这段代码实现抓取多个 URL,并将抓取到的页面代码写入指定的文件
1
|
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
|
// Example 1: fetch several pages concurrently and let curl write every
// response straight into one shared file.

// URLs of the pages to fetch
$urls = array(
    'https://www.zzvips.com/',
    'http://www.google.com/',
    'http://www.example.com/'
);
$save_to = '/test.txt'; // the fetched page source is appended to this file

$st = fopen($save_to, "a");
if ($st === false) {
    // Without a valid stream CURLOPT_FILE cannot be set; stop early.
    exit("Unable to open " . $save_to . " for appending\n");
}

// Set-up: one easy handle per URL, all attached to the same multi handle.
$mh = curl_multi_init();
$conn = array();
foreach ($urls as $i => $url) {
    $conn[$i] = curl_init($url);
    curl_setopt($conn[$i], CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)");
    curl_setopt($conn[$i], CURLOPT_HEADER, 0);
    curl_setopt($conn[$i], CURLOPT_CONNECTTIMEOUT, 60);
    // Write the response body to the file. NOTE: every handle shares the same
    // stream, so chunks of different pages may end up interleaved.
    curl_setopt($conn[$i], CURLOPT_FILE, $st);
    curl_multi_add_handle($mh, $conn[$i]);
}

// Execution: drive the transfers, sleeping in curl_multi_select() between
// rounds instead of spinning on curl_multi_exec() at full CPU.
do {
    $status = curl_multi_exec($mh, $active);
    if ($active && $status == CURLM_OK && curl_multi_select($mh, 1.0) == -1) {
        usleep(1000); // select had nothing to wait on; back off briefly
    }
} while ($active && ($status == CURLM_OK || $status == CURLM_CALL_MULTI_PERFORM));

// Clean-up
foreach ($conn as $ch) {
    curl_multi_remove_handle($mh, $ch);
    curl_close($ch);
}
curl_multi_close($mh);
fclose($st);
(2)下面这段代码与上一个例子类似,区别在于先把抓取到的页面代码保存到变量中,再将其写入指定的文件
1
|
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
|
// Example 2: fetch several pages concurrently, keep each response in a
// variable, then append the responses to a file one after another.

$urls = array(
    'https://www.zzvips.com/',
    'http://www.google.com/',
    'http://www.example.com/'
);
$save_to = '/test.txt'; // the fetched page source is appended to this file

$st = fopen($save_to, "a");
if ($st === false) {
    // Nothing can be saved without the output file; stop early.
    exit("Unable to open " . $save_to . " for appending\n");
}

$mh = curl_multi_init();
$conn = array();
foreach ($urls as $i => $url) {
    $conn[$i] = curl_init($url);
    curl_setopt($conn[$i], CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)");
    curl_setopt($conn[$i], CURLOPT_HEADER, 0);
    curl_setopt($conn[$i], CURLOPT_CONNECTTIMEOUT, 60);
    // Return the response as a string instead of sending it to the output
    curl_setopt($conn[$i], CURLOPT_RETURNTRANSFER, true);
    curl_multi_add_handle($mh, $conn[$i]);
}

// Drive the transfers, sleeping in curl_multi_select() between rounds
// instead of spinning on curl_multi_exec() at full CPU.
do {
    $status = curl_multi_exec($mh, $active);
    if ($active && $status == CURLM_OK && curl_multi_select($mh, 1.0) == -1) {
        usleep(1000); // select had nothing to wait on; back off briefly
    }
} while ($active && ($status == CURLM_OK || $status == CURLM_CALL_MULTI_PERFORM));

// Read each response, write it to the file, then release the handle.
foreach ($conn as $ch) {
    $data = curl_multi_getcontent($ch); // fetched page source as a string
    fwrite($st, $data);                 // append it to the file
    curl_multi_remove_handle($mh, $ch);
    curl_close($ch);
}

curl_multi_close($mh);
fclose($st);
(3)下面这段代码实现的是利用 PHP 的 Curl Functions 实现并发多线程下载文件
1
|
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
|
// Example 3: download several files concurrently, each into its own local
// file under $save_to (named after the last path segment of the URL).

$urls = array(
    'https://www.zzvips.com/5w.zip',
    'https://www.zzvips.com/5w.zip',
    'https://www.zzvips.com/5w.zip'
);
$save_to = './home/'; // target directory; it must already exist

$mh = curl_multi_init();
$conn = array(); // easy handles, only for the URLs that are actually downloaded
$fp = array();   // output streams, keyed like $conn

foreach ($urls as $i => $url) {
    $g = $save_to . basename($url);

    // Skip files that already exist. fopen(..., "w") below creates the file
    // immediately, so a later URL with the same file name is skipped as well.
    if (is_file($g)) {
        continue;
    }

    $out = fopen($g, "w");
    if ($out === false) {
        // Target not writable (e.g. missing directory): skip this URL rather
        // than handing curl an invalid stream.
        continue;
    }

    $fp[$i] = $out;
    $conn[$i] = curl_init($url);
    curl_setopt($conn[$i], CURLOPT_USERAGENT, "Mozilla/4.0(compatible; MSIE 7.0; Windows NT 6.0)");
    curl_setopt($conn[$i], CURLOPT_FILE, $fp[$i]); // stream the download into the file
    curl_setopt($conn[$i], CURLOPT_HEADER, 0);
    curl_setopt($conn[$i], CURLOPT_CONNECTTIMEOUT, 60);
    curl_multi_add_handle($mh, $conn[$i]);
}

// Drive the transfers, sleeping in curl_multi_select() between rounds
// instead of spinning on curl_multi_exec() at full CPU.
do {
    $n = curl_multi_exec($mh, $active);
    if ($active && $n == CURLM_OK && curl_multi_select($mh, 1.0) == -1) {
        usleep(1000); // select had nothing to wait on; back off briefly
    }
} while ($active && ($n == CURLM_OK || $n == CURLM_CALL_MULTI_PERFORM));

// Clean up only the handles that were really created; URLs skipped above
// have no entry in $conn / $fp.
foreach ($conn as $i => $ch) {
    curl_multi_remove_handle($mh, $ch);
    curl_close($ch);
    fclose($fp[$i]);
}
curl_multi_close($mh);
以上所述就是本文的全部内容了,希望大家能够喜欢。