0
下午好, 我嘗試從Web中獲取數據並在SQLServer中存儲數據。使用lib pymssql,連接已建立。但是當處理項目時,錯誤「太多的解包值」出來了,所以我也附加了MyItem類。我看不到明顯的錯誤? 這裏是代碼pipelines.py當將scrapy管道數據寫入數據庫時,錯誤值太高以致無法解壓
-*- coding: utf-8 -*-
import pymssql
from scrapy import signals
import json
import codecs
class MyPipeline(object):
def __init__(self):
self.conn = pymssql.connect(host=r".\\MyPC",user='sa',password='XXXX',database='Webmining')
self.cursor = self.conn.cursor()
def process_item(self, item, spider):
try:
self.cursor.executemany("INSERT INTO RecruitInformation(recruitNumber,name,detailLink,publishTime,catalog,worklocation) VALUES (%d,%s,%s,%t,%s,%s)",(item['recruitNumber'],item['name'],item['detailLink'],item['publishTime'],item['catalog'],item['worklocation']))
self.conn.commit()
except pymssql.InterfaceError, e:
print ("pymssql.InterfaceError")
except pymssql.DataError, e:
print ("pymssql.DataError")
except pymssql.OperationalError, e:
print ("pymssql.OperationalError")
except pymssql.IntegrityError, e:
print ("pymssql.IntegrityError")
except pymssql.InternalError, e:
print ("pymssql.InternalError")
except pymssql.ProgrammingError, e:
print ("pymssql.ProgrammingError")
except pymssql.NotSupportedError, e:
print ("pymssql.NotSupportedError")
return item
def spider_closed(self, spider):
self.conn.close()
//the code in item.py is as follow
import scrapy
from scrapy.item import Item, Field
class MyItem(Item):
name = Field()
catalog = Field()
workLocation = Field()
recruitNumber = Field()
detailLink = Field()
publishTime = Field()
class MySpider(CrawlSpider):
name = "xxxx"
allowed_domains = ["xxxx.com"]
start_urls = [ "http://xx.xxxx.com/position.php"]
rules = [Rule(sle(allow=("/position.php\?&start=\d{,4}#a")), follow=True,callback='parse_item')]
def parse_item(self, response):
items = []
sel = Selector(response)
base_url = get_base_url(response)
sites_even = sel.css('table.tablelist tr.even')
for site in sites_even:
item = MyItem()
item['name'] = site.css('.l.square a').xpath('text()').extract()
relative_url = site.css('.l.square a').xpath('@href').extract()[0]
item['detailLink'] = urljoin_rfc(base_url, relative_url)
item['catalog'] = site.css('tr > td:nth-child(2)::text').extract()
item['workLocation'] = site.css('tr > td:nth-child(4)::text').extract()
item['recruitNumber'] = site.css('tr > td:nth-child(3)::text').extract()
item['publishTime'] = site.css('tr > td:nth-child(5)::text').extract()
items.append(item)
sites_odd = sel.css('table.tablelist tr.odd')
for site in sites_odd:
item = MyItem()
item['name'] = site.css('.l.square a').xpath('text()').extract()
relative_url = site.css('.l.square a').xpath('@href').extract()[0]
item['detailLink'] = urljoin_rfc(base_url, relative_url)
item['catalog'] = site.css('tr > td:nth-child(2)::text').extract()
item['workLocation'] = site.css('tr > td:nth-child(4)::text').extract()
item['recruitNumber'] = site.css('tr > td:nth-child(3)::text').extract()
item['publishTime'] = site.css('tr > td:nth-child(5)::text').extract()
items.append(item)
return items
def _process_request(self, request):
info('process ' + str(request))
return request
由於我不熟悉Scrapy,只能看到item的定義,它是字典式的,並且不僅包含一個值。所以它不能在sql中使用。但如何讓它一個接一個地取出? – SnailBai
在這種情況下,填充元素的代碼將非常重要,因爲我認爲您可以用列表填充一個「Field」而不是單個值。請編輯您的問題,並從蜘蛛中添加代碼。 – GHajba
我已經添加了蜘蛛代碼,我也用executemany()取代了函數execute,它可以處理字典@GHajba – SnailBai