Python下载懒人图库JavaScript特效

2024-03-31 06:39:05 385

#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Description: Python crawler that downloads the JS script templates
#              published on the Lanrentuku site (www.lanrentuku.com).
# Author: admin
# Create-Date: 2015-05-25
# Version: 1.0
import os
import re
import sys
import urllib

import gevent
from gevent import monkey
from bs4 import BeautifulSoup

try:  # Python 3: the download helpers live in urllib.request
    from urllib.request import urlopen, urlretrieve
except ImportError:  # Python 2: they are plain attributes of urllib
    urlopen = urllib.urlopen
    urlretrieve = urllib.urlretrieve

# Patch the socket module so that blocking network calls made by urllib
# yield to other greenlets instead of stalling the whole process.
gevent.monkey.patch_socket()

# Site root; "%s" is replaced by a site-relative path taken from an href.
HTTP_URL = 'http://www.lanrentuku.com%s'
# Archive URL template: HTTP_URL without its trailing "%s", followed by
# "/js/d%szip".  The "%s" receives the "<digits>." fragment matched by
# ``reg``, which yields ".../js/d<digits>.zip".
DOWNLOAD_URL = HTTP_URL[:-2] + '/js/d%szip'
# One or more digits followed by one or more dots (e.g. "1234.").
reg = r'\d{1,}\.+'


def encode(text):
    """Return *text* encoded as UTF-8."""
    return text.encode("utf8")


def createDirectory(curPath):
    """Return the path of *curPath* inside the download folder.

    The download folder lives under the directory returned by
    ``getSubDirectory()`` and is created on first use.
    """
    myPath = os.path.join(getSubDirectory(), u'JS代码模板')
    if not os.path.exists(myPath):
        os.mkdir(myPath)
    return os.path.join(myPath, curPath)


def getSubDirectory():
    """Return the current working directory."""
    return os.getcwd()


def schedule(a, b, c):
    """Progress hook passed to ``urlretrieve``; prints a percentage.

    ``urlretrieve`` calls the hook with the number of blocks transferred
    so far (*a*), the block size in bytes (*b*) and the total size of the
    file (*c*).
    """
    if c <= 0:
        # The total size is unknown or zero, so no percentage can be
        # computed; dividing here would raise or print a negative value.
        return
    per = min(100.0 * a * b / c, 100)
    # "\r" rewinds to the start of the line so the figure updates in place.
    sys.stdout.write('%.1f%%\r' % per)
    sys.stdout.flush()


def geturllist(url):
    """Collect the template links found on one listing page.

    Returns a dict mapping each absolute detail-page URL to the text of
    its anchor.  The dict is empty when the page has no
    ``div.list-pngjs`` container.
    """
    url_list = {}
    html = urlopen(url)
    try:
        content = html.read()
    finally:
        # Release the connection even when the read fails.
        html.close()

    # Parse with BeautifulSoup; the parser is named explicitly so the
    # result does not depend on which optional parsers are installed.
    decodeHtml = BeautifulSoup(content, 'html.parser')
    try:
        container = decodeHtml.find_all('div', {'class': 'list-pngjs'})[0]
    except IndexError as e:
        print(e)
        return url_list

    # Record the link address and title of every anchor in the container.
    for a_tag in container.find_all('a'):
        href = a_tag.get('href')
        if href is None:
            # An anchor without href cannot point at a template page.
            continue
        url_list[HTTP_URL % href] = a_tag.get_text()
    return url_list


def download(down_url):
    """Download the archive that belongs to one template.

    *down_url* is a ``(detail_page_url, title)`` pair as produced by
    ``geturllist(...).items()``.  Errors are printed rather than raised so
    that one failed download does not affect the other greenlets.
    """
    try:
        m = re.search(reg, down_url[0])
        if m is None:
            print('no template id found in %s' % down_url[0])
            return
        name = DOWNLOAD_URL % m.group(0)
        # name[-4:] is the ".zip" suffix; the file is saved as <title>.zip.
        target = createDirectory(down_url[1] + name[-4:])
        urlretrieve(name, target, schedule)
    except Exception as e:
        print(e)


def getpageurl(xUrl):
    """Return the listing-page URLs for pages 1 to 48.

    *xUrl* is a template containing one "%s" placeholder for the page
    number.
    """
    return [xUrl % page for page in range(1, 49)]


if __name__ == '__main__':
    jobs = []
    pageurl = getpageurl('http://www.lanrentuku.com/js/p%s.html')
    # Crawl every listing page and spawn one download greenlet per link.
    for i in pageurl:
        for k in geturllist(i).items():
            jobs.append(gevent.spawn(download, k))
    gevent.joinall(jobs)

Python下载懒人图库JavaScript特效

热门推荐

随机推荐