The full script is below: it queries arXiv's advanced-search page with requests + lxml, parses the result list with XPath, and downloads each matching PDF, trying the cn.arxiv.org mirror first and falling back to arxiv.org.

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time   : 2020/02/11 21:44
# @Author : dangxusheng
# @Email  : dangxusheng163@163.com
# @File   : download_by_href.py
"""Automatically download papers from arxiv.org."""

import os
import os.path as osp
import re
import time
from pprint import pprint

import requests
from lxml import etree

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36",
    "Host": "arxiv.org",
}

HREF_CN = 'http://cn.arxiv.org/pdf/'    # domestic (CN) mirror
HREF_SRC = 'https://arxiv.org/pdf/'     # original site, used as the fallback
SAVE_PATH = '/media/dangxs/E/Paper/download_at_20200730'
os.makedirs(SAVE_PATH, exist_ok=True)

FAIL_URLS = []
FAIL_URLS_TXT = f'{SAVE_PATH}/fail_urls.txt'


def download(url, title):
    # strip characters that are illegal in file names
    pattern = r'[\\/:*?"\'<>|\r\n]+'
    new_title = re.sub(pattern, ' ', title)
    print(f'new title: {new_title}')
    save_filepath = '%s/%s.pdf' % (SAVE_PATH, new_title)
    if osp.exists(save_filepath) and osp.getsize(save_filepath) > 50 * 1024:
        print('this pdf already exists.')
        return True
    try:
        with open(save_filepath, 'wb') as file:
            # stream the response and write it chunk by chunk
            r = requests.get(url, stream=True, timeout=None)
            for chunk in r.iter_content(2048):
                file.write(chunk)
        if osp.getsize(save_filepath) >= 10 * 1024:
            print('%s downloaded successfully.' % title)
            return True
    except Exception as e:
        print(e)
    return False


# query arxiv.org's advanced search
def search(start_size=0, title_keywords='Facial Expression'):
    # example of the legacy search URL:
    # https://arxiv.org/find/grp_eess,grp_stat,grp_cs,grp_econ,grp_math/1/ti:+Face/0/1/0/past,2018,2019/0/1?skip=200&query_id=1c582e6c8afc6146&client_host=cn.arxiv.org
    req_url = 'https://arxiv.org/search/advanced'
    req_data = {
        'advanced': 1,
        'terms-0-operator': 'AND',
        'terms-0-term': title_keywords,
        'terms-0-field': 'title',
        'classification-computer_science': 'y',
        'classification-physics_archives': 'all',
        'classification-include_cross_list': 'include',
        'date-filter_by': 'date_range',  # date_range | specific_year
        'date-year': '',
        'date-from_date': '2015',
        'date-to_date': '2020',
        'date-date_type': 'announced_date_first',  # submitted_date | submitted_date_first | announced_date_first
        'abstracts': 'show',
        'size': 50,
        'order': '-announced_date_first',
        'start': start_size,
    }
    res = requests.get(req_url, params=req_data, headers=headers)
    html = etree.HTML(res.content.decode())
    total_text = html.xpath('//h1[@class="title is-clearfix"]/text()')
    total_text = ' '.join(total_text).replace('\n', ' ').strip()  # e.g.: Showing 1–50 of 355 results
    num = re.findall(r'\d+', total_text)
    if len(num) == 0:  # "Sorry, your query returned no results"
        return [], 0
    total = int(num[-1])  # total number of matches
    paper_list = html.xpath('//ol[@class="breathe-horizontal"]/li')
    info_list = []
    for p in paper_list:
        title = p.xpath('./p[@class="title is-5 mathjax"]//text()')
        title = ' '.join(title).replace('\n', ' ').strip()
        href = p.xpath('./div/p/a/@href')[0]
        info_list.append({'title': title, 'href': href})
    return info_list, total


# scrape a specific page instead (here: a GAN paper list mirrored on Gitee)
def search_special():
    res = requests.get('https://gitee.com/weberyoung/the-gan-zoo?_from=gitee_search')
    html = etree.HTML(res.content.decode())
    paper_list = html.xpath('//div[@class="file_content markdown-body"]//li')
    info_list = []
    for p in paper_list:
        title = p.xpath('.//text()')
        title = ' '.join(title).replace('\n', ' ').strip()
        href = p.xpath('./a/@href')[0]
        info_list.append({'title': title, 'href': href})
    pprint(info_list)
    return info_list


if __name__ == '__main__':
    page_idx = 0
    total = 1000
    keywords = 'Facial Action Unit'
    while page_idx <= total // 50:
        paper_list, total = search(page_idx * 50, keywords)
        print(f'total: {total}')
        if total == 0:
            print('no results found.')
            exit(0)
        for p in paper_list:
            title = p['title']
            href = HREF_CN + p['href'].split('/')[-1] + '.pdf'
            print(href)
            if not download(href, title):
                print('download from the CN mirror failed, falling back to the original site >>>>')
                # retry once from the original arxiv.org URL
                href = HREF_SRC + p['href'].split('/')[-1] + '.pdf'
                if not download(href, title):
                    FAIL_URLS.append(p)
        page_idx += 1

    # fetch the final partial page of results
    last_1 = total - page_idx * 50
    paper_list, total = search(last_1, keywords)
    for p in paper_list:
        title = p['title']
        href = HREF_CN + p['href'].split('/')[-1] + '.pdf'
        if not download(href, title):
            FAIL_URLS.append(p)
        time.sleep(1)

    # log everything that could not be downloaded
    pprint(FAIL_URLS)
    with open(FAIL_URLS_TXT, 'a+') as f:
        for item in FAIL_URLS:
            f.write(item['href'] + '\n')
    print('done.')
```
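To use it, point SAVE_PATH at a writable directory, set keywords in the __main__ block, and run python download_by_href.py.

Because the script scrapes the HTML of arXiv's search page, any change to that page's markup will silently break the XPath queries. A more stable route is arXiv's official Atom API at http://export.arxiv.org/api/query. The sketch below is not part of the original script; it shows the same kind of title search through that endpoint, assumes the third-party feedparser package is installed, and uses the query string only as an example:

```python
import requests
import feedparser  # pip install feedparser

# Ask the official arXiv API for papers whose title contains the phrase.
resp = requests.get(
    'http://export.arxiv.org/api/query',
    params={'search_query': 'ti:"Facial Action Unit"', 'start': 0, 'max_results': 50},
    timeout=30,
)
feed = feedparser.parse(resp.text)
for entry in feed.entries:
    # entry.id looks like http://arxiv.org/abs/2001.01234v1
    arxiv_id = entry.id.split('/abs/')[-1]
    pdf_url = 'https://arxiv.org/pdf/%s.pdf' % arxiv_id
    print(entry.title, pdf_url)
```

Paging is handled by the start and max_results parameters, so the manual bookkeeping for the last partial page in the script above is unnecessary with this route.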
That covers the sample code for automatically downloading papers from arXiv with Python.
Original link: https://www.cnblogs.com/dxscode/p/13406238.html