import re
import time
import urllib.request
from urllib import request

import pymysql
import requests

# Shared MySQL connection used by get_url(); opened when the module is imported.
conn = pymysql.connect(host="127.0.0.1", user="root", passwd="123456", db="world")

_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36'
)

# Seconds to wait on each HTTP request so a stalled connection cannot hang the run.
_REQUEST_TIMEOUT = 10

# Search entry point; the result offset is appended as the value of "s".
_SEARCH_URL = (
    'https://s.taobao.com/search?spm=a21bo.2017.201856-fline.2.1b3311d9sSXobt'
    '&q=%E5%9B%9B%E4%BB%B6%E5%A5%97&s='
)

# Item page URL sent as the Referer of the detail request; the item id is appended.
_ITEM_REFERER = (
    'https://item.taobao.com/item.htm'
    '?spm=a1z10.5-c-s.w4002-18518582505.20.6d887041nVz3D2&id='
)

# Detail endpoint whose response carries the sales count.
# NOTE(review): sellerId is the same fixed value for every item; presumably it
# should come from the listing's "user_id" field -- confirm before relying on it.
_DETAIL_URL = (
    'https://detailskip.taobao.com/service/getData/1/p1/item/detail/sib.htm'
    '?itemId={item_id}&sellerId=102291787'
    '&modules=dynStock,qrcode,viewer,price,duty,xmpPromotion,delivery,activity,'
    'fqg,zjys,couponActivity,soldQuantity,originalPrice,tradeContract'
    '&callback=onSibRequestSuccess'
)

# Patterns applied to the raw search-page text (compiled once, not per page).
_TITLE_RE = re.compile(r'"raw_title":"(.*?)","pic_url"')
_PICTURE_RE = re.compile(r'"pic_url":"//(.*?)"')
_ITEM_ID_RE = re.compile(r'"nid":"(.*?)","category"')
_VIEW_SALES_RE = re.compile(r'"view_sales":"(.*?)"')
# These two previously used "." where ":" was meant, so they matched any separator.
_SHOP_RE = re.compile(r'"nick":"(.*?)"')
_PRICE_RE = re.compile(r'"view_price":"(.*?)","view_fee"')

# Pattern applied to the detail response to pull out the sales count.
_SOLD_RE = re.compile(r'"sellCountDO":{"sellCount":"(.*?)","success":true}')

_INSERT_SQL = (
    "insert into taobaopc1(bt,tp,spid,xl,dm,jg)"
    "values(%s,%s,%s,%s,%s,%s)"
)


def _fetch_sold_counts(item_id):
    """Return every sales-count match found in the detail response for item_id."""
    headers = {
        # The Referer header must be present on this request.
        'Referer': _ITEM_REFERER + item_id,
        'User-Agent': _USER_AGENT,
    }
    response = requests.get(
        _DETAIL_URL.format(item_id=item_id),
        headers=headers,
        timeout=_REQUEST_TIMEOUT,
    )
    return _SOLD_RE.findall(response.text)


def get_url():
    """Scrape Taobao search results and insert one row per listing into MySQL.

    Requests the search URL for offsets 5*44 through 10*44, extracts the title,
    picture URL, item id, listing sales text, shop nick and price with regular
    expressions, fetches the sales count of each item from the detail endpoint,
    and inserts (bt, tp, spid, xl, dm, jg) into table ``taobaopc1`` through the
    module-level ``conn``, committing after every row.

    Raises:
        requests.RequestException: if an HTTP request fails or times out.
        pymysql.MySQLError: if the insert or commit fails.
    """
    search_headers = {'User-Agent': _USER_AGENT}

    for i in range(5, 11):
        response = requests.get(
            _SEARCH_URL + str(i * 44),
            headers=search_headers,
            timeout=_REQUEST_TIMEOUT,
        )
        data = response.text

        titles = _TITLE_RE.findall(data)
        pictures = _PICTURE_RE.findall(data)
        item_ids = _ITEM_ID_RE.findall(data)
        print(item_ids)
        view_sales = _VIEW_SALES_RE.findall(data)
        shops = _SHOP_RE.findall(data)
        prices = _PRICE_RE.findall(data)

        # Each pattern is matched independently, so the lists are only assumed
        # to line up by position. zip() stops at the shortest list, which avoids
        # an IndexError when one field is missing from some listing.
        listings = zip(titles, pictures, item_ids, view_sales, shops, prices)
        for j, (title, picture, item_id, listed_sales, shop, price) in enumerate(
            listings, start=1
        ):
            picture_url = 'https://' + picture
            sold_counts = _fetch_sold_counts(item_id)

            print('第' + str(i + 1) + "页" + '第' + str(j) + '个')
            print(title)
            print(item_id)
            print(sold_counts)

            # Use the detail endpoint's count when present; otherwise fall back
            # to the sales text shown on the search page (its format may differ).
            sold = sold_counts[0] if sold_counts else listed_sales

            row = (title, picture_url, item_id, sold, shop, price)
            # Values are passed as parameters so quotes in scraped text cannot
            # break or alter the statement.
            with conn.cursor() as cursor:
                print(cursor.mogrify(_INSERT_SQL, row))
                cursor.execute(_INSERT_SQL, row)
            conn.commit()