This article demonstrates how to implement a simple crawler in PHP. It is shared here for your reference; the details are as follows:
<?php
/**
 * Crawler -- prototype
 *
 * Fetch the HTML content of the given URL.
 * Requires allow_url_fopen to be enabled in php.ini.
 *
 * @param string $url
 * @return string|false
 */
function _getUrlContent($url)
{
    $handle = @fopen($url, "r");
    if ($handle) {
        // Read at most 1 MB of the response
        $content = stream_get_contents($handle, 1024 * 1024);
        fclose($handle);
        return $content;
    }
    return false;
}

/**
 * Extract link targets from HTML content.
 *
 * @param string $web_content
 * @return array
 */
function _filterUrl($web_content)
{
    // Match <a ... href="..."> case-insensitively and capture the href value
    $reg_tag_a = '/<a\b[^>]*href=[\'"]?([^\'">\s]+)/i';
    if (preg_match_all($reg_tag_a, $web_content, $match_result)) {
        return $match_result[1];
    }
    return array();
}

/**
 * Turn relative paths into absolute URLs.
 *
 * @param string $base_url
 * @param array  $url_list
 * @return array|null
 */
function _reviseUrl($base_url, $url_list)
{
    $url_info = parse_url($base_url);
    $base_url = $url_info["scheme"] . '://';
    if (isset($url_info["user"]) && isset($url_info["pass"])) {
        $base_url .= $url_info["user"] . ":" . $url_info["pass"] . "@";
    }
    $base_url .= $url_info["host"];
    if (isset($url_info["port"])) {
        $base_url .= ":" . $url_info["port"];
    }
    if (isset($url_info["path"])) {
        $base_url .= $url_info["path"];
    }
    // print_r($base_url); // uncomment to inspect the rebuilt base URL
    if (!is_array($url_list)) {
        return;
    }
    $result = array();
    foreach ($url_list as $url_item) {
        if (preg_match('/^http/', $url_item)) {
            // Already an absolute URL
            $result[] = $url_item;
        } else {
            // Relative URL: naively join it onto the base
            $result[] = rtrim($base_url, '/') . '/' . ltrim($url_item, '/');
        }
    }
    return $result;
}

/**
 * Crawler: fetch one page and return the URLs found on it.
 *
 * @param string $url
 * @return array|null
 */
function crawler($url)
{
    $content = _getUrlContent($url);
    if ($content) {
        $url_list = _reviseUrl($url, _filterUrl($content));
        if ($url_list) {
            return $url_list;
        }
    }
    return;
}

/**
 * Test driver.
 */
function main()
{
    $current_url = "http://hao123.com/";  // seed URL
    $fp_puts = fopen("url.txt", "ab");    // append newly found URLs here
    $fp_gets = fopen("url.txt", "r");     // read back URLs to crawl next
    do {
        $result_url_arr = crawler($current_url);
        if ($result_url_arr) {
            foreach ($result_url_arr as $url) {
                fputs($fp_puts, $url . "\r\n");
            }
        }
        // Keep pulling URLs from the file; trim the trailing newline so
        // fopen() receives a clean URL. Note there is no visited-set, so
        // the loop will revisit duplicates until interrupted.
    } while ($current_url = trim(fgets($fp_gets, 1024)));
    fclose($fp_puts);
    fclose($fp_gets);
}

main();
?>
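The fetch step above relies on allow_url_fopen being enabled, which some hosts turn off. On such hosts, a cURL-based fetch is a common drop-in replacement. Below is a minimal sketch, assuming the cURL extension is installed; the function name _getUrlContentCurl and the 10-second timeout are illustrative choices, not part of the original program.

<?php
/**
 * Alternative fetch using the cURL extension (a sketch; assumes
 * ext/curl is available -- not part of the original article).
 *
 * @param string $url
 * @return string|false
 */
function _getUrlContentCurl($url)
{
    $ch = curl_init($url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); // return the body instead of printing it
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); // follow HTTP redirects
    curl_setopt($ch, CURLOPT_TIMEOUT, 10);          // give up after 10 seconds
    $content = curl_exec($ch);                      // string on success, false on failure
    curl_close($ch);
    return $content;
}
?>

To try it, replace the call to _getUrlContent($url) inside crawler() with _getUrlContentCurl($url); the rest of the program is unchanged.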
Hopefully this article is helpful to readers working on PHP programming.