Batch-downloading Sina Blog posts with Python
This article walks through a working example of batch-downloading Sina Blog (blog.sina.com.cn) posts with Python. The script targets Python 2 and the BeautifulSoup 3 library, and is shared for your reference. The implementation is as follows:
#coding=utf-8
import urllib2
import sys, os
import re
from BeautifulSoup import BeautifulSoup
def encode(s):
    # Re-encode a UTF-8 string for the console so messages print correctly
    return s.decode('utf-8').encode(sys.stdout.encoding, 'ignore')
def getHTML(url):
    #proxy_handler = urllib2.ProxyHandler({'http': 'http://211.138.124.211:80'})
    #opener = urllib2.build_opener(proxy_handler)
    #urllib2.install_opener(opener)
    req = urllib2.Request(url)
    response = urllib2.urlopen(req, timeout=15)
    return BeautifulSoup(response, convertEntities=BeautifulSoup.HTML_ENTITIES)
def visible(element):
    '''Keep only visible text elements'''
    if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
        return False
    elif re.match('<!--.*-->', str(element)):
        return False
    elif element == u'\xa0':
        return False
    return True
def delReturn(element):
    '''Remove line breaks inside an element'''
    return re.sub('(?<!^)\n+(?!$)', '', str(element)).decode('utf-8')
def validFilename(filename):
    # strip characters that are illegal in Windows filenames
    return re.sub('[\/:*?<>"|\xa0]', '', filename)
def writeToFile(text, filename, dirname):
    if not os.path.exists(dirname):
        os.makedirs(dirname)
        print encode('Saving to directory'), dirname
    filename = validFilename(filename)
    print encode('Saving article'), filename
    path = os.path.join(dirname, filename)
    if not os.path.exists(path):
        f = open(path, 'w')
        f.write(text)
        f.close()
    else:
        print filename, encode('already exists')
def formatContent(url, title=''):
    '''Format the content of one article'''
    page = getHTML(url)
    content = page.find('div', {'class': 'articalContent'})
    art_id = re.search('blog_(\w+)\.html', url).group(1)
    blog_name = page.find('span', id='blognamespan').string
    if title == '':
        title = page.find('h2', id=re.compile('^t_')).string
    temp_data = filter(visible, content.findAll(text=True))  # drop invisible elements
    temp_data = ''.join(map(delReturn, temp_data))  # remove line breaks inside elements
    temp_data = temp_data.strip()  # strip leading/trailing blank lines
    temp_data = re.sub('\n{2,}', '\n\n', temp_data)  # collapse runs of blank lines
    # write to file; encode explicitly to avoid codec issues
    temp_data = u'Article URL: ' + url + '\n\n' + temp_data
    op_text = temp_data.encode('utf-8')
    op_file = title + '_' + art_id + '.txt'
    writeToFile(op_text, op_file, blog_name)
def articlelist(url):
    articles = {}
    page = getHTML(url)
    pages = page.find('ul', {'class': 'SG_pages'}).span.string
    page_num = int(re.search('(\d+)', pages).group(1))
    for i in range(1, page_num + 1):
        print encode('Building article index for page %d' % i)
        if i != 1:
            url = re.sub('(_)\d+(\.html)$', '\g<1>' + str(i) + '\g<2>', url)
            page = getHTML(url)
        article = page.findAll('span', {'class': 'atc_title'})
        for art in article:
            art_title = art.a['title']
            art_href = art.a['href']
            articles[art_title] = art_href
    return articles
def blog_dld(articles):
    if not isinstance(articles, dict):
        return False
    print encode('Start downloading articles')
    for art_title, art_href in articles.items():
        formatContent(art_href, art_title)
if __name__ == '__main__':
    sel = raw_input(encode('Download (1) all articles or (2) a single article? Enter 1 or 2: '))
    if sel == '1':
        #articlelist_url = 'http://blog.sina.com.cn/s/articlelist_1303481411_0_1.html'
        articlelist_url = raw_input(encode('Enter the blog article list URL: '))
        articles = articlelist(articlelist_url)
        blog_dld(articles)
    else:
        #article_url = 'http://blog.sina.com.cn/s/blog_4db18c430100gxc5.html'
        article_url = raw_input(encode('Enter the blog article URL: '))
        formatContent(article_url)
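Note that the script above targets Python 2 (urllib2, print statements, raw_input) and the old BeautifulSoup 3 API. For reference only, a minimal sketch of the equivalent fetch-and-parse step on Python 3 is shown below; it assumes the third-party beautifulsoup4 (bs4) package is installed, and the get_html name and 'html.parser' choice are illustrative rather than part of the original script.

# Minimal Python 3 sketch of getHTML(), assuming `pip install beautifulsoup4`
import urllib.request
from bs4 import BeautifulSoup

def get_html(url):
    # urllib.request replaces urllib2; bs4 decodes HTML entities on its own,
    # so BeautifulSoup 3's convertEntities argument is no longer needed
    response = urllib.request.urlopen(url, timeout=15)
    return BeautifulSoup(response, 'html.parser')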
Hopefully this article proves helpful for your own Python programming.