Python爬虫系列之唯品会商品数据采集
如有疑问点击这里联系我们
微信请扫描下方二维码
代码仅供学习交流,请勿用于非法用途。
直接上代码
import configparser
import json
import os
import random
import re
import sys
import threading
import time
import traceback
import urllib.parse
from queue import Empty, Queue
from urllib import parse

import execjs
import redis
import requests
import xlrd
import xlwt
from xlutils.copy import copy

from RedisUtils import RedisUtils
from sign import getHeaders, getHeaders_

# Number of attempts made for every remote call before giving up.
retry = 3
# Per-request timeout in seconds passed to requests.get.
timeout = 20
# Key pattern handed to RedisUtils.getLikeKeys to list pooled proxies.
kw = "vipproxy"
r = RedisUtils()
# Header row of every generated workbook; order matches the rows built in
# vipSpider.getGoodsDetail.
excelTitle = ["商品标题", "商品编号", "适用季节", "适用性别", "款式", "面料", "尺码", "划线价", "现价", "券后价"]
excelPath = os.getcwd() + "/data/"
os.makedirs(excelPath, exist_ok=True)

# Extracts the brand-store serial number from a search response body.
_BRAND_SN_RE = re.compile(r'"brandStore":{"sn":"(\d+)"')
# Extracts the JSON object embedded in a (possibly wrapped) response body.
_JSON_BODY_RE = re.compile(r'.*?({.*})')
# Everything that can go wrong while parsing a missing or malformed response.
_PARSE_ERRORS = (AttributeError, IndexError, KeyError, TypeError, ValueError)
# Deletes the characters that are not allowed in Windows file names.
_FILENAME_STRIP = str.maketrans("", "", ':\\/*?"<>|')
# detailData key -> product property name looked up in the detail payload.
_DETAIL_PROPS = (
    ("syjj", "适用季节"),
    ("syxb", "适用性别"),
    ("ks", "款式"),
    ("ml", "面料"),
)

cf = configparser.ConfigParser()
try:
    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it did parse, so a missing conf.ini shows up as an empty
    # list rather than as an exception.
    _loadedConf = cf.read(os.getcwd() + "/conf.ini", encoding="utf-8-sig")
except (configparser.Error, UnicodeDecodeError) as e:
    print(e)
    _loadedConf = []
if not _loadedConf:
    print("程序目录下不存在conf.ini配置文件~")
    sys.exit(0)


def getConf(sec, key):
    """Return option *key* of section *sec* from conf.ini.

    Prints a diagnostic and terminates the process (exit status 0, as the
    script has always done) when the section or option is missing.
    """
    try:
        return cf.get(sec, key)
    except configparser.Error as e:
        print(e)
        print("未得到以下配置:" + sec + " - " + key)
        sys.exit(0)


# Worker thread count; falls back to 1 for non-numeric or non-positive values.
try:
    threadNums = max(1, int(getConf("app-sys", "threadNums")))
except ValueError:
    threadNums = 1


def getCurrentTime():
    """Return the local time formatted as ``YYYY_mm_dd_HH_MM_SS``."""
    return time.strftime('%Y_%m_%d_%H_%M_%S', time.localtime())


def getHtml(url, headers, proxies):
    """GET *url* and return the body decoded as UTF-8.

    Makes up to ``retry`` attempts. Returns ``None`` when every attempt
    fails; callers treat that as an unparsable response.
    """
    for _ in range(retry):
        try:
            resp = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
            return resp.content.decode("utf-8")
        except Exception:
            # Deliberate best effort: any transport or decoding failure just
            # consumes one attempt.
            continue
    return None


def getProxy():
    """Block until a proxy is available in redis and return it.

    A random key matching ``kw`` is picked, removed from the pool, and its
    value returned as ``{"http": value}`` for use as ``requests`` proxies.
    """
    while True:
        result = r.getLikeKeys(kw)
        if result and isinstance(result, list):
            key = random.choice(result)
            ip = {"http": r.get(key)}
            r.rm(key)
            return ip
        print("ip池暂无任何ip,请检查代理平台账号ip余量是否充足,或 ip池.exe / redis-server 是否启动!")
        # Wait before polling again so an empty pool does not spin the CPU
        # and flood the console.
        time.sleep(1)


def getCookies():
    """Block until the ``vipcookie`` redis key holds a value and return it."""
    while True:
        try:
            vipcookie = r.get("vipcookie")
            if vipcookie:
                return vipcookie
        except Exception:
            print("暂未获取到任何cookie数据,请检查redis-server是否启动或 保持.exe 是否启动!")
        time.sleep(1)


def getCookieData(cookieStr, keys):
    """Return the cookies from *cookieStr* whose names are listed in *keys*.

    *cookieStr* is a ``name=value; name=value`` string. All spaces are
    removed before parsing. Items without ``=`` are ignored. Only the first
    ``=`` separates name from value, so values that themselves contain ``=``
    (for example base64 padding) are preserved intact.
    """
    if not cookieStr or not keys:
        return {}
    try:
        items = cookieStr.replace(" ", "").split(";")
    except AttributeError:
        return {}
    cookieData = {}
    for item in items:
        name, sep, value = item.partition("=")
        if sep and name in keys:
            cookieData[name] = value
    return cookieData


class vipSpider(threading.Thread):
    """Worker thread that exports the in-stock products of queued brands.

    Each brand name taken from ``brandQueue`` gets its own ``.xls`` workbook
    under ``excelPath`` with one row per in-stock SKU.

    NOTE(review): every request URL in this class is a bare path without
    scheme, host or query string, so requests cannot succeed as written; the
    full URLs appear to have been lost and must be restored.
    """

    def __init__(self, brandQueue, index, *args, **kwargs):
        super(vipSpider, self).__init__(*args, **kwargs)
        self.brandQueue = brandQueue
        self.index = index
        self.proxies = getProxy()

    def updateProxy(self):
        """Replace this worker's proxy with a fresh one from the pool."""
        self.proxies = getProxy()
        print("线程 %d 更新ip %s " % (self.index, self.proxies))

    def initExcel(self, path, title):
        """Create a workbook at *path* whose first row is *title*.

        Returns ``True`` on success and ``False`` on any failure.
        """
        try:
            book = xlwt.Workbook()
            sheet = book.add_sheet(u'double', cell_overwrite_ok=True)
            for col, text in enumerate(title):
                sheet.write(0, col, text)
            book.save(path)
            return True
        except Exception as e:
            print(e)
            return False

    def writeExcel(self, data, path):
        """Append *data* as one row to the first sheet of the workbook at *path*.

        Every cell is written as ``str(value)``. Returns ``True`` when the
        workbook was saved and ``False`` otherwise.
        """
        print("------------------------------------------")
        print(data)
        print("------------------------------------------")
        try:
            workbook = xlrd.open_workbook(path)
            firstSheet = workbook.sheet_by_name(workbook.sheet_names()[0])
            nextRow = firstSheet.nrows
            # xlrd workbooks are read-only; xlutils.copy yields a writable one.
            writable = copy(workbook)
            target = writable.get_sheet(0)
            for col, value in enumerate(data):
                try:
                    target.write(nextRow, col, str(value))
                except Exception:
                    # A cell that cannot be written must not lose the row.
                    continue
            writable.save(path)
            return True
        except Exception as e:
            print(e)
            return False

    def getBrandStoreSn(self, brandName):
        """Return the brand-store serial number for *brandName* as a string.

        Returns ``None`` (after printing a hint) when it cannot be obtained
        within ``retry`` attempts; the proxy is rotated after each failure.
        """
        for _ in range(retry):
            headers = getHeaders()
            # NOTE(review): brandName is not part of the request; the query
            # string seems to be missing from this URL.
            url = "/vips-mobile/rest/shopping/pc/search/product/rank"
            res = getHtml(url, headers, self.proxies)
            try:
                return str(_BRAND_SN_RE.search(res).group(1))
            except _PARSE_ERRORS:
                self.updateProxy()
        print("品牌:%s,未成功获取到品牌相关信息,请检查名称是否正确!" % brandName)
        return None

    def getGoodsList(self, brandStoreSn, page):
        """Return ``(productIds, isLast)`` for one listing page of a brand.

        When the page cannot be fetched or parsed within ``retry`` attempts
        ``([], True)`` is returned, so callers that unpack the result stop
        paging instead of failing on ``None``.
        """
        for _ in range(retry):
            headers = getHeaders()
            # NOTE(review): the offset (120 items per page) and brandStoreSn
            # are not sent; the query string seems to be missing from the URL.
            pageOffset = str((int(page) - 1) * 120)
            url = "/vips-mobile/rest/shopping/pc/brandstore/product/rank"
            res = getHtml(url, headers, self.proxies)
            try:
                datas = json.loads(_JSON_BODY_RE.search(res).group(1))
                goodsList = datas['data']['productIds']
                isLast = str(datas['data']['isLast']) != "0"
                return goodsList, isLast
            except _PARSE_ERRORS:
                self.updateProxy()
        return [], True

    def getPropertyVal(self, key, props):
        """Return the ``value`` of the entry in *props* named *key*, else ``None``."""
        for prop in props or ():
            try:
                if key == prop['name']:
                    return prop['value']
            except (KeyError, TypeError):
                continue
        return None

    def getGoodsStocks(self, pid):
        """Return the ``items`` list of the stock response for product *pid*.

        Returns ``None`` when it cannot be obtained within ``retry`` attempts.
        """
        for _ in range(retry):
            url = "/detail"
            # NOTE(review): `params` is not defined anywhere in this file, so
            # this line raises NameError until the request parameters
            # (presumably derived from pid) are restored.
            headers = getHeaders_("/detail", params)
            res = getHtml(url, headers, self.proxies)
            try:
                return json.loads(_JSON_BODY_RE.search(res).group(1))['items']
            except _PARSE_ERRORS:
                self.updateProxy()
        return None

    def getGoodsDetail(self, brandName, pid):
        """Fetch product *pid* and append one workbook row per in-stock SKU.

        Nothing is written when the stock list, the product detail or the
        price cannot be obtained.
        """
        skus = self.getGoodsStocks(pid)
        if not skus:
            return

        detailData = {}
        for _ in range(retry):
            headers = getHeaders(str(pid))
            url = "/vips-mobile/rest/shopping/pc2/product/detail/v5"
            res = getHtml(url, headers, self.proxies)
            try:
                datas = json.loads(res)['data']['product']
                fetched = {
                    'title': datas['title'],
                    'brandName': brandName,
                    'brandIdStr': datas['brandIdStr'],
                    'merchandiseSn': datas['merchandiseSn'],
                }
                props = datas.get('props')
            except _PARSE_ERRORS:
                self.updateProxy()
                continue
            for field, propName in _DETAIL_PROPS:
                fetched[field] = self.getPropertyVal(propName, props) or ""
            detailData = fetched
            break

        if not detailData:
            # Every attempt failed; without the detail there is nothing to
            # export and no brand id to price against.
            return

        # NOTE(review): getPrice is not defined on this class in this file;
        # the rows below expect it to return a mapping with saleMarketPrice,
        # salePrice and finalPrice.
        price = self.getPrice(pid, detailData['brandIdStr'])
        if not price:
            return

        for sku in skus:
            try:
                if int(sku['stock']) <= 0:
                    continue
                row = [
                    detailData['title'],
                    detailData['merchandiseSn'],
                    detailData['syjj'],
                    detailData['syxb'],
                    detailData['ks'],
                    detailData['ml'],
                    sku['name'],
                    price['saleMarketPrice'],
                    price['salePrice'],
                    price['finalPrice'],
                ]
            except (KeyError, TypeError, ValueError):
                # Malformed SKU or price entry: skip this SKU only.
                continue
            self.writeExcel(row, self.excelPath)

    def run(self):
        """Drain the brand queue, exporting one workbook per brand."""
        while True:
            # get_nowait() avoids the empty()/get() race in which another
            # worker takes the last brand and this one blocks forever.
            try:
                brandName = self.brandQueue.get_nowait()
            except Empty:
                break

            brandStoreSn = self.getBrandStoreSn(brandName)
            if not brandStoreSn:
                continue

            safeName = str(brandName).translate(_FILENAME_STRIP)
            self.excelPath = excelPath + "data_brand_" + safeName + "_t_" + getCurrentTime() + ".xls"
            if not self.initExcel(self.excelPath, excelTitle):
                continue

            page = 1
            while True:
                goodsList, isLast = self.getGoodsList(brandStoreSn, page)
                for goods in goodsList or ():
                    try:
                        self.getGoodsDetail(brandName, goods)
                    except Exception as e:
                        # One bad product must not stop the whole brand.
                        print(e)
                # An empty page also ends paging so a response that never
                # reports isLast cannot loop forever.
                if isLast or not goodsList:
                    break
                page += 1


def getBrandQueue():
    """Return a queue holding the non-empty lines of ``brands.txt``.

    The queue is empty when the file cannot be read.
    """
    brandQueue = Queue(0)
    try:
        with open("brands.txt", "r", encoding="utf-8") as f:
            for line in f:
                line = line.replace("\r", "").replace("\n", "")
                if line:
                    brandQueue.put(line)
    except (OSError, UnicodeDecodeError) as e:
        print(e)
    return brandQueue


def main():
    """Start one worker per brand, capped at the configured thread count."""
    global threadNums
    brandQueue = getBrandQueue()
    threadNums = min(threadNums, brandQueue.qsize())
    for i in range(threadNums):
        try:
            vipSpider(brandQueue, i).start()
        except Exception as e:
            print(e)


if __name__ == '__main__':
    main()