Sample code for automatically downloading papers from arxiv with Python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time: 2020/02/11 21:44
# @Author: dangxusheng
# @Email: dangxusheng163@163.com
# @File: download_by_href.py
'''
Automatically download papers from arxiv.org
'''
import os
import os.path as osp
import requests
from lxml import etree
from pprint import pprint
import re
import time
import glob
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36",
    "Host": 'arxiv.org'
}
# China mirror and original-source base URLs for the PDFs
HREF_CN = 'http://cn.arxiv.org/pdf/'
HREF_SRC = 'https://arxiv.org/pdf/'  # fallback used when the cn mirror fails
SAVE_PATH = '/media/dangxs/E/Paper/download_at_20200730'
os.makedirs(SAVE_PATH, exist_ok=True)
FAIL_URLS = []
FAIL_URLS_TXT = f'{SAVE_PATH}/fail_urls.txt'
def download(url, title):
    # strip characters that are not allowed in file names
    pattern = r'[\\/:*?"\'<>|\r\n]+'
    new_title = re.sub(pattern, "", title)
    print(f'new title: {new_title}')
    save_filepath = '%s/%s.pdf' % (SAVE_PATH, new_title)
    if osp.exists(save_filepath) and osp.getsize(save_filepath) > 50 * 1024:
        print('this pdf already exists.')
        return True
    try:
        with open(save_filepath, 'wb') as file:
            # stream the response and write it chunk by chunk
            r = requests.get(url, stream=True, timeout=None)
            for i in r.iter_content(2048):
                file.write(i)
        if osp.getsize(save_filepath) >= 10 * 1024:
            print('%s downloaded successfully.' % title)
            return True
    except Exception as e:
        print(e)
    return False
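# Illustrative note (not in the original script): download() only needs a direct PDF
# URL plus a title, so it can also be called on its own, e.g. with a hypothetical id:
#   download(HREF_SRC + '2001.00001.pdf', 'some paper title')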
# Query the arxiv.org advanced search and collect result titles and links
def search(start_size=0, title_keywords='Facial Expression'):
    # Example search URL: https://arxiv.org/find/grp_eess,grp_stat,grp_cs,grp_econ,grp_math/1/ti:+Face/0/1/0/past,2018,2019/0/1?skip=200&query_id=1c582e6c8afc6146&client_host=cn.arxiv.org
    req_url = 'https://arxiv.org/search/advanced'
    req_data = {
        'advanced': 1,
        'terms-0-operator': 'AND',
        'terms-0-term': title_keywords,
        'terms-0-field': 'title',
        'classification-computer_science': 'y',
        'classification-physics_archives': 'all',
        'classification-include_cross_list': 'include',
        'date-filter_by': 'date_range',  # date_range | specific_year
        # 'date-year': DOWN_YEAR,
        'date-year': '',
        'date-from_date': '2015',
        'date-to_date': '2020',
        'date-date_type': 'announced_date_first',  # submitted_date | submitted_date_first | announced_date_first
        'abstracts': 'show',
        'size': 50,
        'order': '-announced_date_first',
        'start': start_size,
    }
    res = requests.get(req_url, params=req_data, headers=headers)
    html = res.content.decode()
    html = etree.HTML(html)
    total_text = html.xpath('//h1[@class="title is-clearfix"]/text()')
    total_text = ''.join(total_text).replace('\n', '').strip()
    # e.g.: Showing 1–50 of 355 results
    num = re.findall(r'\d+', total_text)
    # Sorry, your query returned no results
    if len(num) == 0: return [], 0
    total = int(num[-1])  # total number of results
    paper_list = html.xpath('//ol[@class="breathe-horizontal"]/li')
    info_list = []
    for p in paper_list:
        title = p.xpath('./p[@class="title is-5 mathjax"]//text()')
        title = ''.join(title).replace('\n', '').strip()
        href = p.xpath('./div/p/a/@href')[0]
        info_list.append({'title': title, 'href': href})
    return info_list, total
# Download from a specific page
def search_special():
    res = requests.get('https://gitee.com/weberyoung/the-gan-zoo?_from=gitee_search')
    html = res.content.decode()
    html = etree.HTML(html)
    paper_list = html.xpath('//div[@class="file_content markdown-body"]//li')
    info_list = []
    for p in paper_list:
        title = p.xpath('.//text()')
        title = ''.join(title).replace('\n', '').strip()
        href = p.xpath('./a/@href')[0]
        info_list.append({'title': title, 'href': href})
    pprint(info_list)
    return info_list
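# Note: search_special() is not called in the __main__ block below; it is a standalone
# helper that scrapes paper titles and links from the gitee mirror of the-gan-zoo list
# instead of the arxiv advanced search.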
if __name__ == '__main__':
    page_idx = 0
    total = 1000
    keywords = 'Facial Action Unit'
    while page_idx <= total // 50:
        paper_list, total = search(page_idx * 50, keywords)
        print(f'total: {total}')
        if total == 0:
            print('no results found.')
            exit(0)
        for p in paper_list:
            title = p['title']
            href = HREF_CN + p['href'].split('/')[-1] + '.pdf'
            print(href)
            if not download(href, title):
                print('Download from the cn mirror failed, retrying from the original source >>>>')
                # retry once with the international URL
                href = HREF_SRC + p['href'].split('/')[-1] + '.pdf'
                if not download(href, title):
                    FAIL_URLS.append(p)
        page_idx += 1

    # download the remaining part
    last_1 = total - page_idx * 50
    paper_list, total = search(last_1, keywords)
    for p in paper_list:
        title = p['title']
        href = HREF_CN + p['href'].split('/')[-1] + '.pdf'
        if not download(href, title):
            FAIL_URLS.append(p)
        time.sleep(1)

    pprint(FAIL_URLS)
    with open(FAIL_URLS_TXT, 'a+') as f:
        for item in FAIL_URLS:
            href = item['href']
            title = item['title']
            f.write(href + '\n')
    print('done.')
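If you only need the first page of results for a single keyword, the two helpers can also be combined more directly. The sketch below is not part of the original script: it assumes PDFs are fetched straight from arxiv.org rather than the cn.arxiv.org mirror, and 'Facial Expression' is just an example keyword.

# Minimal usage sketch: first page of results for one keyword,
# downloading each PDF from the original arxiv.org address only.
papers, total = search(0, 'Facial Expression')
print(f'{total} results in total, fetching the first {len(papers)}')
for p in papers:
    arxiv_id = p['href'].split('/')[-1]  # last path segment of the result link
    pdf_url = 'https://arxiv.org/pdf/' + arxiv_id + '.pdf'
    if not download(pdf_url, p['title']):
        print('failed:', pdf_url)
    time.sleep(1)  # be polite to the server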
That is the full sample code for automatically downloading papers from arxiv with Python. For more material on downloading papers from arxiv with Python, please follow the other related articles on 毛票票!