1、推荐的一种方法:php判断搜索引擎蜘蛛爬虫还是人为访问代码,摘自Discuz x3.2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
|
<?php
/**
 * Decide whether a request looks like it comes from a search-engine crawler.
 *
 * The check is keyword based. A User-Agent that contains a browser keyword
 * and advertises no URL is treated as a human visitor; otherwise the
 * User-Agent is scanned for crawler keywords.
 *
 * @param string $useragent User-Agent to test. When empty, the current
 *                          request's HTTP_USER_AGENT header is used.
 * @return bool True for a crawler, false otherwise (including when no
 *              User-Agent is available at all).
 */
function checkrobot($useragent = '')
{
    static $kw_spiders = array('bot', 'crawl', 'spider', 'slurp', 'sohu-search', 'lycos', 'robozilla');
    static $kw_browsers = array('msie', 'netscape', 'opera', 'konqueror', 'mozilla');

    if (empty($useragent)) {
        // The header is optional: fall back to '' instead of reading a missing key.
        $useragent = isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : '';
    }
    $useragent = strtolower((string) $useragent);

    // Crawlers commonly embed a contact URL in their User-Agent; accept both schemes.
    $advertisesUrl = strpos($useragent, 'http://') !== false
        || strpos($useragent, 'https://') !== false;

    if (!$advertisesUrl && dstrpos($useragent, $kw_browsers)) {
        return false;
    }

    if (dstrpos($useragent, $kw_spiders)) {
        return true;
    }

    return false;
}

/**
 * Test whether a string contains any of the given substrings.
 *
 * @param string       $string      Haystack; an empty() haystack never matches.
 * @param array|string $arr         One needle or a list of needles.
 * @param bool         $returnvalue When true, return the matching needle
 *                                  instead of true.
 * @return bool|string False when nothing matches; otherwise true, or the
 *                     first matching needle when $returnvalue is true.
 */
function dstrpos($string, $arr, $returnvalue = false)
{
    if (empty($string)) {
        return false;
    }

    foreach ((array) $arr as $v) {
        if (strpos($string, $v) !== false) {
            return $returnvalue ? $v : true;
        }
    }

    return false;
}

// Demo: report how the current request was classified.
if (checkrobot()) {
    echo '机器人爬虫';
} else {
    echo '人';
}
?>
实际应用中可以这样判断:只有当访问者不是搜索引擎蜘蛛时才执行相应操作
1
2
3
4
5
|
<?php
// Run the guarded work only when checkrobot() does not classify the visitor as a crawler.
if (!checkrobot()) {
    // do something
}
?>
2、第二种方法:
使用PHP实现蜘蛛访问日志统计
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
|
/**
 * Map a User-Agent string to the label of the first spider signature it contains.
 *
 * @param string $useragent User-Agent value; it is lowercased before matching.
 * @return string|null Label of the matching spider, or null when none matches.
 */
function detect_spider_label($useragent)
{
    // Order matters: the first match wins, so a longer signature must come
    // before any shorter one it contains ('msnbot' before 'msn', and the
    // catch-all 'bot' last). Needles are lowercase to match the lowercased haystack.
    static $signatures = array(
        'googlebot'            => 'Google',
        'mediapartners-google' => 'Google Adsense',
        'baiduspider'          => 'Baidu',
        'sogou spider'         => 'Sogou',
        'sogou web'            => 'Sogou web',
        'sosospider'           => 'SOSO',
        '360spider'            => '360Spider',
        'yahoo'                => 'Yahoo',
        'msnbot'               => 'msnbot',
        'msn'                  => 'MSN',
        'sohu'                 => 'Sohu',
        'yodaobot'             => 'Yodao',
        'twiceler'             => 'Twiceler',
        'ia_archiver'          => 'Alexa_',
        'iaarchiver'           => 'Alexa',
        'slurp'                => '雅虎',
        'bot'                  => '其它蜘蛛',
    );

    $useragent = strtolower((string) $useragent);
    foreach ($signatures as $needle => $label) {
        if (strpos($useragent, $needle) !== false) {
            return $label;
        }
    }

    return null;
}

// The User-Agent header is optional, so fall back to an empty string.
$useragent = addslashes(strtolower(isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : ''));
$bot = detect_spider_label($useragent);

if (isset($bot)) {
    // Append one record per spider visit: timestamp, client IP, spider label, requested URL.
    $fp = @fopen('bot.txt', 'a');
    // Logging is best effort: skip the write when the log file cannot be opened.
    if ($fp !== false) {
        fwrite(
            $fp,
            date('Y-m-d H:i:s') . " " . $_SERVER["REMOTE_ADDR"] . " " . $bot . " "
            . 'http://' . $_SERVER['SERVER_NAME'] . $_SERVER["REQUEST_URI"]
            . "\n" // end each record with a newline so entries do not run together
        );
        fclose($fp);
    }
}
3、第三种方法:
我们可以通过HTTP_USER_AGENT来判断是否是蜘蛛,搜索引擎的蜘蛛都有自己的独特标志,下面列举了其中一部分。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
|
/**
 * Tell whether the current request's User-Agent contains a known spider signature.
 *
 * Matching is case-insensitive: both the header and each signature are
 * lowercased before comparison.
 *
 * @return bool True when a signature is found; false otherwise, including
 *              when the request carries no User-Agent header.
 */
function is_crawler()
{
    // The header is optional; without it there is nothing to match against.
    if (!isset($_SERVER['HTTP_USER_AGENT'])) {
        return false;
    }

    $userAgent = strtolower($_SERVER['HTTP_USER_AGENT']);
    $spiders = array(
        'Googlebot',    // Google crawler
        'Baiduspider',  // Baidu crawler
        'Yahoo! Slurp', // Yahoo crawler
        'YodaoBot',     // Youdao crawler
        'msnbot'        // Bing crawler
        // add more crawler keywords here
    );

    foreach ($spiders as $spider) {
        if (strpos($userAgent, strtolower($spider)) !== false) {
            return true;
        }
    }

    return false;
}
下面的php代码附带了更多的蜘蛛标识
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
|
/**
 * Tell whether the current request's User-Agent contains one of a longer
 * list of spider signatures.
 *
 * Matching is case-insensitive: both the header and each signature are
 * lowercased before comparison. The function produces no output.
 *
 * @return bool True when a signature is found; false when none matches or
 *              when the User-Agent header is missing or empty.
 */
function isCrawler()
{
    // The header is optional; treat a missing one like an empty one.
    $agent = isset($_SERVER['HTTP_USER_AGENT']) ? strtolower($_SERVER['HTTP_USER_AGENT']) : '';
    if (empty($agent)) {
        return false;
    }

    $spiderSite = array(
        "TencentTraveler",
        "Baiduspider+",
        "BaiduGame",
        "Googlebot",
        "msnbot",
        "Sosospider+",
        "Sogou web spider",
        "ia_archiver",
        "Yahoo! Slurp",
        "YoudaoBot",
        "Yahoo Slurp",
        "MSNBot",
        "Java (Often spam bot)",
        "BaiDuSpider",
        "Voila",
        "Yandex bot",
        "BSpider",
        "twiceler",
        "Sogou Spider",
        "Speedy Spider",
        "Google AdSense",
        "Heritrix",
        "Python-urllib",
        "Alexa (IA Archiver)",
        "Ask",
        "Exabot",
        "Custo",
        "OutfoxBot/YodaoBot",
        "yacy",
        "SurveyBot",
        "legs",
        "lwp-trivial",
        "Nutch",
        "StackRambler",
        "The web archive (IA Archiver)",
        "Perl tool",
        "MJ12bot",
        "Netcraft",
        "MSIECrawler",
        "WGet tools",
        "larbin",
        "Fish search",
    );

    foreach ($spiderSite as $val) {
        if (strpos($agent, strtolower($val)) !== false) {
            return true;
        }
    }

    // Explicit result for a non-empty User-Agent that matched nothing.
    return false;
}

// Demo: greet the visitor according to the classification.
if (isCrawler()) {
    echo "你好蜘蛛精!";
} else {
    echo "你不是蜘蛛精啊!";
}
4、第四种方法:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
|
<?php
// Gate script: let known search-engine spiders through and stop every other request.

// Signatures are matched case-sensitively against the raw User-Agent header.
$spiderSignatures = array(
    'Googlebot',
    'Baiduspider',
    'Yahoo! Slurp',
    'msnbot',
    'Sosospider',
    'YodaoBot',
    'OutfoxBot',
    'Sogou web spider',
    'Sogou Orion spider',
    'fast-webcrawler',
    'Gaisbot',
    'ia_archiver',
    'altavista',
    'lycos_spider',
    'Inktomi slurp',
);

$flag = false;
// The header is optional, so fall back to an empty string.
$tmp = isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : '';

foreach ($spiderSignatures as $signature) {
    // Compare with !== false: a signature at offset 0 is still a match.
    if (strpos($tmp, $signature) !== false) {
        $flag = true;
        break;
    }
}

if ($flag == false) {
    // Redirect to the corresponding page on http://www.zzvips.com
    // $_SERVER['REQUEST_URI'] is the path that follows the domain name
    // Or use instead: header("Location: http://www.zzvips.com/tags.html");
    exit();
}
?>