2000字范文,分享全网优秀范文,学习好帮手!
2000字范文 > Python爬虫系列之唯品会商品数据采集

Python爬虫系列之唯品会商品数据采集

时间:2021-09-15 00:34:40

相关推荐

Python爬虫系列之唯品会商品数据采集

Python爬虫系列之唯品会商品数据采集

如有疑问点击这里联系我们

微信请扫描下方二维码

代码仅供学习交流,请勿用于非法用途

直接上代码

"""Crawler that exports per-brand product data (title, SKU sizes, prices) to .xls files.

Brand names are read from ``brands.txt``; worker threads pull brands from a
shared queue, look up the brand store id, page through its product ids and
append one spreadsheet row per in-stock SKU.

NOTE(review): this file was recovered from a web page that stripped line
breaks and parts of string literals.  The request URLs below have lost their
scheme/host and query string, ``params`` in ``getGoodsStocks`` is never
defined, and ``vipSpider.getPrice`` is called but not present.  Those pieces
must be restored from the original project before the crawler can run.
"""
import requests
from queue import Queue, Empty
import configparser
import json
import sys
import execjs
import xlrd
import xlwt
import os
import re
import redis
from xlutils.copy import copy
import random
import threading
import time
from sign import getHeaders, getHeaders_
import traceback
from RedisUtils import RedisUtils
from urllib import parse
import urllib.parse

# Number of attempts for every remote call, and per-request timeout in seconds.
retry = 3
timeout = 20
# Pattern handed to RedisUtils.getLikeKeys to find proxy entries.
kw = "vipproxy"
r = RedisUtils()
excelTitle = ["商品标题", "商品编号", "适用季节", "适用性别", "款式", "面料", "尺码", "划线价", "现价", "券后价"]
excelPath = os.getcwd() + "/data/"
os.makedirs(excelPath, exist_ok=True)

# Exceptions raised while picking fields out of a (possibly missing) response.
_PARSE_ERRORS = (TypeError, AttributeError, IndexError, KeyError, ValueError)
# Extracts the brand store serial number from the search response.
_BRAND_SN_RE = re.compile(r'"brandStore":{"sn":"(\d+)"')
# Extracts the JSON object from a response that wraps it in other text.
_JSON_BODY_RE = re.compile(r'.*?({.*})')
# Characters removed from a brand name before it is used in a file name.
_FILENAME_STRIP = str.maketrans("", "", ':\\/*?"<>|')
# (detailData key, product property name) pairs copied from the detail response.
_PROP_FIELDS = (
    ("syjj", "适用季节"),
    ("syxb", "适用性别"),
    ("ks", "款式"),
    ("ml", "面料"),
)

cf = configparser.ConfigParser()
try:
    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it did parse, so a missing conf.ini shows up as an empty list.
    _loadedConf = cf.read(os.getcwd() + "/conf.ini", encoding="utf-8-sig")
except (configparser.Error, OSError, UnicodeError) as e:
    print(e)
    print("程序目录下不存在conf.ini配置文件~")
    sys.exit(0)
if not _loadedConf:
    print("程序目录下不存在conf.ini配置文件~")
    sys.exit(0)


def getConf(sec, key):
    """Return option ``key`` of section ``sec`` from conf.ini; exit the process if absent."""
    try:
        return cf.get(sec, key)
    except Exception as e:
        print(e)
        print("未得到以下配置:" + sec + " - " + key)
        sys.exit(0)


# Worker thread count from conf.ini; anything unparsable or non-positive becomes 1.
threadNums = 1
try:
    threadNums = int(getConf("app-sys", "threadNums"))
except (TypeError, ValueError):
    threadNums = 1
if threadNums <= 0:
    threadNums = 1


def getCurrentTime():
    """Return the local time formatted as ``YYYY_mm_dd_HH_MM_SS``."""
    return time.strftime('%Y_%m_%d_%H_%M_%S', time.localtime())


def getHtml(url, headers, proxies):
    """GET ``url`` and return the body decoded as UTF-8.

    Tries up to ``retry`` times.  Returns ``None`` when every attempt fails,
    so callers must be prepared for a missing body.
    """
    for _ in range(retry):
        try:
            resp = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
            return resp.content.decode("utf-8")
        except Exception:
            # Best-effort fetch: any failure just consumes one attempt.
            continue
    return None


def getProxy():
    """Block until a proxy is available in redis, then claim and return it.

    The chosen key is removed from redis so no other worker reuses it.
    Returns a ``{"http": <value>}`` mapping suitable for ``requests``.
    """
    while True:
        result = r.getLikeKeys(kw)
        if result and isinstance(result, list):
            key = random.choice(result)
            ip = {"http": r.get(key)}
            r.rm(key)
            return ip
        print("ip池暂无任何ip,请检查代理平台账号ip余量是否充足,或 ip池.exe / redis-server 是否启动!")
        # Wait before polling again instead of spinning and flooding the console.
        time.sleep(1)


def getCookies():
    """Block until the ``vipcookie`` value exists in redis and return it."""
    while True:
        try:
            vipcookie = r.get("vipcookie")
            if vipcookie:
                return vipcookie
        except Exception:
            print("暂未获取到任何cookie数据,请检查redis-server是否启动或 保持.exe 是否启动!")
        time.sleep(1)


def getCookieData(cookieStr, keys):
    """Return the ``name -> value`` pairs of ``cookieStr`` whose name is in ``keys``.

    ``cookieStr`` is a ``;``-separated cookie header; spaces are dropped.
    Entries without ``=`` are skipped.  Anything that is not a string, or an
    empty ``keys``, yields an empty dict.
    """
    cookieData = {}
    if not isinstance(cookieStr, str) or not keys:
        return cookieData
    for item in cookieStr.replace(" ", "").split(";"):
        # Split on the first "=" only: cookie values may themselves contain "=".
        name, sep, value = item.partition("=")
        if sep and name in keys:
            cookieData[name] = value
    return cookieData


class vipSpider(threading.Thread):
    """Worker thread that crawls every brand it can take from ``brandQueue``.

    NOTE(review): ``getPrice`` is called by ``getGoodsDetail`` but is not
    defined in the recovered source - restore it from the original project.
    """

    def __init__(self, brandQueue, index, *args, **kwargs):
        super(vipSpider, self).__init__(*args, **kwargs)
        self.brandQueue = brandQueue
        self.index = index
        # Blocks until the proxy pool hands out an address.
        self.proxies = getProxy()

    def updateProxy(self):
        """Replace this worker's proxy with a fresh one from the pool."""
        self.proxies = getProxy()
        print("线程 %d 更新ip %s " % (self.index, self.proxies))

    def initExcel(self, path, title):
        """Create a workbook at ``path`` whose first row is ``title``; return success."""
        try:
            f = xlwt.Workbook()
            sheet1 = f.add_sheet(u'double', cell_overwrite_ok=True)
            for col, text in enumerate(title):
                sheet1.write(0, col, text)
            f.save(path)
            return True
        except Exception:
            return False

    def writeExcel(self, data, path):
        """Append ``data`` as one row to the first sheet of the workbook at ``path``.

        Returns ``True`` when the workbook was saved, ``False`` otherwise.
        """
        print("------------------------------------------")
        print(data)
        print("------------------------------------------")
        try:
            workbook = xlrd.open_workbook(path)
            sheets = workbook.sheet_names()
            worksheet = workbook.sheet_by_name(sheets[0])
            rows_old = worksheet.nrows
            # xlrd workbooks are read-only; xlutils.copy gives a writable clone.
            new_workbook = copy(workbook)
            new_worksheet = new_workbook.get_sheet(0)
            for col, value in enumerate(data):
                try:
                    new_worksheet.write(rows_old, col, str(value))
                except Exception:
                    # A cell that cannot be written is left blank.
                    continue
            new_workbook.save(path)
            return True
        except Exception:
            return False

    def getBrandStoreSn(self, brandName):
        """Return the brand store serial number for ``brandName`` as a string.

        Returns ``None`` (after printing a message) when it cannot be found.
        """
        for _ in range(retry):
            headers = getHeaders()
            # FIXME(review): host and query string (presumably carrying
            # brandName) were lost from this URL in the recovered source.
            url = "/vips-mobile/rest/shopping/pc/search/product/rank"
            res = getHtml(url, headers, self.proxies)
            try:
                return str(_BRAND_SN_RE.search(res).group(1))
            except _PARSE_ERRORS:
                self.updateProxy()
        print("品牌:%s,未成功获取到品牌相关信息,请检查名称是否正确!" % brandName)
        return None

    def getGoodsList(self, brandStoreSn, page):
        """Return ``(productIds, isLast)`` for one result page of a brand store.

        When every attempt fails, returns ``([], True)`` so the caller stops
        paging instead of failing on tuple unpacking.
        """
        for _ in range(retry):
            headers = getHeaders()
            # 120 products per page.
            pageOffset = str((int(page) - 1) * 120)
            # FIXME(review): host and query string (presumably carrying
            # brandStoreSn and pageOffset) were lost from this URL.
            url = "/vips-mobile/rest/shopping/pc/brandstore/product/rank"
            res = getHtml(url, headers, self.proxies)
            try:
                datas = json.loads(_JSON_BODY_RE.search(res).group(1))
                goodsList = datas['data']['productIds']
                isLast = str(datas['data']['isLast']) != "0"
                return goodsList, isLast
            except _PARSE_ERRORS:
                self.updateProxy()
        return [], True

    def getPropertyVal(self, key, props):
        """Return the ``value`` of the entry in ``props`` whose ``name`` is ``key``, else ``None``."""
        if not props:
            return None
        for prop in props:
            try:
                if key == prop['name']:
                    return prop['value']
            except (KeyError, TypeError):
                continue
        return None

    def getGoodsStocks(self, pid):
        """Return the ``items`` (SKU/stock) list for product ``pid``, or ``None`` on failure."""
        for _ in range(retry):
            # FIXME(review): host and query string were lost from this URL, and
            # ``params`` is not defined anywhere in the recovered source, so
            # this call raises NameError until it is restored.
            url = "/detail"
            headers = getHeaders_("/detail", params)
            res = getHtml(url, headers, self.proxies)
            try:
                return json.loads(_JSON_BODY_RE.search(res).group(1))['items']
            except _PARSE_ERRORS:
                self.updateProxy()
        return None

    def getGoodsDetail(self, brandName, pid):
        """Fetch product ``pid`` and write one row per in-stock SKU to ``self.excelPath``."""
        skus = self.getGoodsStocks(pid)
        if not skus:
            return
        detailData = {}
        for _ in range(retry):
            headers = getHeaders(str(pid))
            # FIXME(review): host and query string were lost from this URL.
            url = "/vips-mobile/rest/shopping/pc2/product/detail/v5"
            res = getHtml(url, headers, self.proxies)
            try:
                datas = json.loads(res)['data']['product']
                detailData['title'] = datas['title']
                detailData['brandName'] = brandName
                detailData['brandIdStr'] = datas['brandIdStr']
                detailData['merchandiseSn'] = datas['merchandiseSn']
            except _PARSE_ERRORS:
                self.updateProxy()
                continue
            # Optional properties: a missing one becomes an empty cell.
            for field, label in _PROP_FIELDS:
                try:
                    value = self.getPropertyVal(label, datas['props'])
                except (KeyError, TypeError):
                    value = None
                detailData[field] = value if value else ""
            break
        if 'brandIdStr' not in detailData:
            # Every detail attempt failed; there is nothing to price or write.
            return
        price = self.getPrice(pid, detailData['brandIdStr'])
        if not price:
            return
        for sku in skus:
            try:
                if int(sku['stock']) <= 0:
                    continue
                data = [
                    detailData['title'],
                    detailData['merchandiseSn'],
                    detailData['syjj'],
                    detailData['syxb'],
                    detailData['ks'],
                    detailData['ml'],
                    sku['name'],
                    price['saleMarketPrice'],
                    price['salePrice'],
                    price['finalPrice'],
                ]
            except _PARSE_ERRORS:
                # Malformed SKU or price entry: skip this row only.
                continue
            self.writeExcel(data, self.excelPath)

    def run(self):
        """Process brands from the queue until it is empty."""
        while True:
            try:
                # get_nowait avoids blocking forever when another worker takes
                # the last brand between an empty() check and get().
                brandName = self.brandQueue.get_nowait()
            except Empty:
                break
            brandStoreSn = self.getBrandStoreSn(brandName)
            if not brandStoreSn:
                continue
            safeName = str(brandName).translate(_FILENAME_STRIP)
            self.excelPath = excelPath + "data_brand_" + safeName + "_t_" + getCurrentTime() + ".xls"
            if not self.initExcel(self.excelPath, excelTitle):
                continue
            page = 1
            while True:
                goodsList, isLast = self.getGoodsList(brandStoreSn, page)
                for goods in goodsList or []:
                    try:
                        self.getGoodsDetail(brandName, goods)
                    except Exception:
                        # One bad product must not stop the brand, but the
                        # failure is reported instead of silently dropped.
                        traceback.print_exc()
                if isLast:
                    break
                page += 1


def getBrandQueue():
    """Return a queue holding every non-empty line of ``brands.txt``.

    An unreadable file is reported and results in an empty queue.
    """
    brandQueue = Queue(0)
    try:
        with open("brands.txt", "r", encoding="utf-8") as f:
            for line in f:
                line = line.replace("\r", "").replace("\n", "")
                if line:
                    brandQueue.put(line)
    except (OSError, UnicodeError) as e:
        print(e)
    return brandQueue


def main():
    """Start one worker per configured thread, never more than there are brands."""
    global threadNums
    brandQueue = getBrandQueue()
    threadNums = min(threadNums, brandQueue.qsize())
    for i in range(threadNums):
        try:
            vipSpider(brandQueue, i).start()
        except Exception:
            traceback.print_exc()


if __name__ == '__main__':
    main()

本内容不代表本网观点和政治立场,如有侵犯你的权益请联系我们处理。
网友评论
网友评论仅供其表达个人看法,并不表明网站立场。