An example of crawling web page content with Scrapy in Python
Last week I spent a week learning Python and Scrapy and built a complete web crawler from zero to one. The process was painful at times, but also genuinely enjoyable; that's just how technical work goes.
First, installing Python. There are plenty of pitfalls to climb out of one by one. I'm on Windows (no budget for a Mac), so I ran into all sorts of problems and all sorts of missing dependencies during installation.
I won't repeat the installation tutorial here. If you hit an ERROR during installation complaining about a missing Windows C/C++ compiler, it usually means the Windows build toolchain is not installed. Most tutorials online tell you to install Visual Studio, which is overkill; in practice, installing the Windows SDK is enough.
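For reference, once the build environment is in place, installing Scrapy itself usually comes down to a single pip command (assuming pip is available on your PATH):

pip install scrapy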
Below is my crawler code.
The spider main program:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from zjf.FsmzItems import FsmzItem
from scrapy.selector import Selector

# target channel: "Quanquan" (emotion & life)
class MySpider(scrapy.Spider):
    # spider name
    name = "MySpider"
    # allowed domain
    allowed_domains = ["nvsheng.com"]
    # start URLs (filled in dynamically, see __init__)
    start_urls = []
    # flag
    x = 0

    # parse callback
    def parse(self, response):
        item = FsmzItem()
        sel = Selector(response)
        item['title'] = sel.xpath('//h1/text()').extract()
        item['text'] = sel.xpath('//*[@class="content"]/p/text()').extract()
        item['imags'] = sel.xpath('//div[@id="content"]/p/a/img/@src|//div[@id="content"]/p/img/@src').extract()
        # only follow the pagination links once
        if MySpider.x == 0:
            page_list = MySpider.getUrl(self, response)
            for page_single in page_list:
                yield Request(page_single)
        MySpider.x += 1
        yield item

    # init: pass the start URL in dynamically
    # command-line form: scrapy crawl MySpider -a start_url="http://some_url"
    def __init__(self, *args, **kwargs):
        super(MySpider, self).__init__(*args, **kwargs)
        self.start_urls = [kwargs.get('start_url')]

    # collect the pagination links (all page anchors except the "next" button)
    def getUrl(self, response):
        url_list = []
        select = Selector(response)
        page_list_tmp = select.xpath('//div[@class="viewnewpages"]/a[not(@class="next")]/@href').extract()
        for page_tmp in page_list_tmp:
            if page_tmp not in url_list:
                url_list.append("http://www.nvsheng.com/emotion/px/" + page_tmp)
        return url_list
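Before trusting the XPath expressions in parse, it is worth trying them interactively. Scrapy ships with an interactive shell for exactly this; the sketch below uses a placeholder article URL, and the expressions are the same ones used in the spider:

scrapy shell "http://www.nvsheng.com/emotion/px/some_article.html"

# inside the shell, `response` is already bound to the fetched page:
response.xpath('//h1/text()').extract()
response.xpath('//*[@class="content"]/p/text()').extract()
response.xpath('//div[@id="content"]/p/img/@src').extract()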
The pipelines class:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from zjf import settings
import json, os, re, random
import urllib.request
import requests, json
from requests_toolbelt.multipart.encoder import MultipartEncoder

class MyPipeline(object):
    flag = 1
    post_title = ''
    post_text = []
    post_text_imageUrl_list = []
    cs = []
    user_id = ''

    def __init__(self):
        MyPipeline.user_id = MyPipeline.getRandomUser('37619,18441390,18441391')

    # process the data
    def process_item(self, item, spider):
        # random user_id used to simulate the post
        user_id = MyPipeline.user_id
        # concatenate the body text into text_str_tmp
        text = item['text']
        text_str_tmp = ""
        for s in text:
            text_str_tmp = text_str_tmp + s
        # print(text_str_tmp)
        # grab the title (first page only)
        if MyPipeline.flag == 1:
            title = item['title']
            MyPipeline.post_title = MyPipeline.post_title + title[0]
        # save and upload the images
        text_insert_pic = ''
        text_insert_pic_w = ''
        text_insert_pic_h = ''
        for imag_url in item['imags']:
            img_name = imag_url.replace('/', '').replace('.', '').replace('|', '').replace(':', '')
            pic_dir = settings.IMAGES_STORE + '%s.jpg' % (img_name)
            urllib.request.urlretrieve(imag_url, pic_dir)
            # upload the image; the API returns JSON
            upload_img_result = MyPipeline.uploadImage(pic_dir, 'image/jpeg')
            # read the stored image path (and size) out of the JSON
            text_insert_pic = upload_img_result['result']['image_url']
            text_insert_pic_w = upload_img_result['result']['w']
            text_insert_pic_h = upload_img_result['result']['h']
        # assemble the JSON fragment for this page
        if MyPipeline.flag == 1:
            cs_json = {"c": text_str_tmp, "i": "", "w": text_insert_pic_w, "h": text_insert_pic_h}
        else:
            cs_json = {"c": text_str_tmp, "i": text_insert_pic, "w": text_insert_pic_w, "h": text_insert_pic_h}
        MyPipeline.cs.append(cs_json)
        MyPipeline.flag += 1
        return item

    # called when the spider opens
    def open_spider(self, spider):
        pass

    # called when the spider closes
    def close_spider(self, spider):
        strcs = json.dumps(MyPipeline.cs)
        jsonData = {"apisign": "99ea3eda4b45549162c4a741d58baa60", "user_id": MyPipeline.user_id,
                    "gid": 30, "t": MyPipeline.post_title, "cs": strcs}
        MyPipeline.uploadPost(jsonData)

    # upload an image
    # note: the helpers below are called via the class (MyPipeline.uploadImage(...)),
    # so they take no self parameter
    def uploadImage(img_path, content_type):
        "uploadImage function"
        # UPLOAD_IMG_URL = "http://api.qa.douguo.net/robot/uploadpostimage"
        UPLOAD_IMG_URL = "http://api.douguo.net/robot/uploadpostimage"
        # send the picture
        # imgPath = 'D:\pics\http___img_nvsheng_com_uploads_allimg_170119_18-1f1191g440_jpg.jpg'
        m = MultipartEncoder(
            # fields={'user_id': '192323',
            #         'images': ('filename', open(imgPath, 'rb'), 'image/JPEG')}
            fields={'user_id': MyPipeline.user_id,
                    'apisign': '99ea3eda4b45549162c4a741d58baa60',
                    'image': ('filename', open(img_path, 'rb'), 'image/jpeg')}
        )
        r = requests.post(UPLOAD_IMG_URL, data=m, headers={'Content-Type': m.content_type})
        return r.json()

    # create the post
    def uploadPost(jsonData):
        CREATE_POST_URL = "http://api.douguo.net/robot/uploadimagespost"
        reqPost = requests.post(CREATE_POST_URL, data=jsonData)

    # pick a random user id from a comma-separated string
    def getRandomUser(userStr):
        user_list = []
        user_chooesd = ''
        for user_id in str(userStr).split(','):
            user_list.append(user_id)
        userId_idx = random.randint(1, len(user_list))
        user_chooesd = user_list[userId_idx - 1]
        return user_chooesd
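The pipeline only runs if it is registered in the project's settings.py (as the boilerplate comment above notes), and process_item also reads settings.IMAGES_STORE as the local download directory. Below is a minimal sketch of the relevant settings; the module path zjf.pipelines.MyPipeline, the priority 300 and the D:/pics/ path are assumptions for illustration, not taken from the original project:

# zjf/settings.py (excerpt)
BOT_NAME = 'zjf'

# register the pipeline; module path and priority (0-1000) are assumed here
ITEM_PIPELINES = {
    'zjf.pipelines.MyPipeline': 300,
}

# local directory for downloaded images; keep the trailing slash,
# because process_item concatenates IMAGES_STORE + '%s.jpg' directly
IMAGES_STORE = 'D:/pics/'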
The Items class that defines the stored fields:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy

class FsmzItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    # tutor = scrapy.Field()
    # strongText = scrapy.Field()
    text = scrapy.Field()
    imags = scrapy.Field()
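scrapy.Item subclasses behave like dictionaries restricted to their declared fields, which is why both the spider and the pipeline read and write them with plain key access. A tiny sketch for illustration:

item = FsmzItem()
item['title'] = ['Some title']            # set a declared field
item['text'] = ['paragraph 1', 'paragraph 2']
print(item['title'][0])                   # read it back like a dict
# item['author'] = 'x'                    # would raise KeyError: only declared Fields are allowed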
Finally, type the following on the command line:
scrapy crawl MySpider -a start_url="http://www.aaa.com"
This will crawl the content under aaa.com.