PHP屏蔽蜘蛛访问代码:
常用搜索引擎名与 HTTP_USER_AGENT对应值
百度：baiduspider
谷歌：googlebot
搜狗：sogou
腾讯SOSO：sosospider
雅虎：slurp
有道：youdaobot
Bing：bingbot
MSN：msnbot
Alexa：ia_archiver
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
|
/**
 * Tell whether the current request was sent by a known search-engine crawler.
 *
 * The User-Agent request header is lower-cased and searched for each entry
 * of a short signature list (also lower-cased), so matching is a
 * case-insensitive substring test.
 *
 * @return bool True when the User-Agent contains one of the signatures;
 *              false otherwise, including when the header is absent or empty.
 */
function is_crawler()
{
    // The User-Agent header is optional: guard the read so a request without
    // it does not raise an undefined-index warning or pass null to strtolower().
    $userAgent = isset($_SERVER['HTTP_USER_AGENT'])
        ? strtolower((string) $_SERVER['HTTP_USER_AGENT'])
        : '';

    if ($userAgent === '') {
        return false;
    }

    $spiders = array(
        'Googlebot',    // Google crawler
        'Baiduspider',  // Baidu crawler
        'Yahoo! Slurp', // Yahoo crawler
        'YodaoBot',     // Youdao crawler
        'msnbot',       // Bing crawler
        // Add more crawler keywords here.
    );

    foreach ($spiders as $spider) {
        // Signatures are written in their display casing; compare lower-cased.
        if (strpos($userAgent, strtolower($spider)) !== false) {
            return true;
        }
    }

    return false;
}
下面的php代码附带了更多的蜘蛛标识
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
|
/**
 * Tell whether the current request was sent by a known crawler or bot.
 *
 * The User-Agent request header is lower-cased and searched for each entry
 * of the signature list below (also lower-cased), so matching is a
 * case-insensitive substring test.
 *
 * @return bool True when the User-Agent contains one of the signatures;
 *              false otherwise, including when the header is absent or empty.
 */
function isCrawler()
{
    // Guard the read: the header is optional. The value is deliberately NOT
    // echoed - printing a raw request header would leak debug output into the
    // page and reflect unescaped client input.
    $agent = isset($_SERVER['HTTP_USER_AGENT'])
        ? strtolower((string) $_SERVER['HTTP_USER_AGENT'])
        : '';

    if (empty($agent)) {
        return false;
    }

    // NOTE(review): several entries ("Java (Often spam bot)", "Perl tool",
    // "WGet tools", "Alexa (IA Archiver)", ...) look like descriptive labels
    // rather than real User-Agent substrings, and short ones such as "Ask" or
    // "legs" may match unrelated agents - confirm before relying on them.
    $spiderSite = array(
        "TencentTraveler",
        "Baiduspider+",
        "BaiduGame",
        "Googlebot",
        "msnbot",
        "Sosospider+",
        "Sogou web spider",
        "ia_archiver",
        "Yahoo! Slurp",
        "YoudaoBot",
        "Yahoo Slurp",
        "MSNBot",
        "Java (Often spam bot)",
        "BaiDuSpider",
        "Voila",
        "Yandex bot",
        "BSpider",
        "twiceler",
        "Sogou Spider",
        "Speedy Spider",
        "Google AdSense",
        "Heritrix",
        "Python-urllib",
        "Alexa (IA Archiver)",
        "Ask",
        "Exabot",
        "Custo",
        "OutfoxBot/YodaoBot",
        "yacy",
        "SurveyBot",
        "legs",
        "lwp-trivial",
        "Nutch",
        "StackRambler",
        "The web archive (IA Archiver)",
        "Perl tool",
        "MJ12bot",
        "Netcraft",
        "MSIECrawler",
        "WGet tools",
        "larbin",
        "Fish search",
    );

    foreach ($spiderSite as $val) {
        if (strpos($agent, strtolower($val)) !== false) {
            return true;
        }
    }

    // Explicit result for "no signature matched"; without it the function
    // would fall off the end and return null instead of a boolean.
    return false;
}

// Demo: report whether the current visitor was recognised as a crawler.
if (isCrawler()) {
    echo "你好蜘蛛精!";
} else {
    echo "你不是蜘蛛精啊!";
}
使用PHP实现蜘蛛访问日志统计
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
|
// Guarded so that including this snippet more than once in a request does not
// trigger a "cannot redeclare function" fatal error.
if (!function_exists('spider_log_detect_bot')) {
    /**
     * Map a lower-cased User-Agent string to the label written to the spider log.
     *
     * Signatures are tried in order and the first substring match wins, so
     * more specific signatures must come before the generic ones that contain
     * them ('msnbot' before 'msn', everything before the catch-all 'bot').
     *
     * @param string $useragent Lower-cased User-Agent header value.
     *
     * @return string|null Label of the matched crawler, or null when no
     *                     signature matches.
     */
    function spider_log_detect_bot($useragent)
    {
        // All needles must be lower case because the haystack is lower-cased.
        $signatures = array(
            'googlebot'            => 'Google',
            'mediapartners-google' => 'Google Adsense',
            'baiduspider'          => 'Baidu',
            'sogou spider'         => 'Sogou',
            'sogou web'            => 'Sogou web',
            'sosospider'           => 'SOSO',
            '360spider'            => '360Spider',
            'yahoo'                => 'Yahoo',
            'msnbot'               => 'msnbot',
            'msn'                  => 'MSN',
            'sohu'                 => 'Sohu',
            'yodaobot'             => 'Yodao',
            'youdaobot'            => 'Yodao',
            'twiceler'             => 'Twiceler',
            'ia_archiver'          => 'Alexa_',
            'iaarchiver'           => 'Alexa',
            'slurp'                => '雅虎',
            'bot'                  => '其它蜘蛛',
        );

        foreach ($signatures as $needle => $label) {
            if (strpos($useragent, (string) $needle) !== false) {
                return $label;
            }
        }

        return null;
    }
}

// The User-Agent header is optional; treat a missing header as an empty string.
// It is only used for substring matching, so no escaping is applied to it.
$useragent = isset($_SERVER['HTTP_USER_AGENT'])
    ? strtolower((string) $_SERVER['HTTP_USER_AGENT'])
    : '';

$spiderLabel = spider_log_detect_bot($useragent);
if ($spiderLabel !== null) {
    $bot = $spiderLabel;
}

if (isset($bot)) {
    // Best-effort logging: a log file that cannot be opened must not break the
    // page, so the warning is suppressed and the write is skipped on failure.
    $fp = @fopen('bot.txt', 'a');
    if ($fp !== false) {
        // One tab-separated record per visit: time, client IP, label, URL.
        fwrite(
            $fp,
            date('Y-m-d H:i:s') . "\t"
                . $_SERVER["REMOTE_ADDR"] . "\t"
                . $bot . "\t"
                . 'http://' . $_SERVER['SERVER_NAME'] . $_SERVER["REQUEST_URI"]
                . "\r\n"
        );
        fclose($fp);
    }
}