# Crawl product information from JD.com (京东商城) and display the data as a DataFrame.
from selenium.webdriver import Chrome, ChromeOptions
import re
import pymysql
from selenium.webdriver.common.by import By
import collections
import pandas as pd
class Spider:
    """Selenium-driven crawler that collects product parameters from JD.com.

    Every visited product page contributes one ``dict`` (parameter name ->
    parameter value) to ``info_diclist``; ``show_as_dataframe`` prints the
    collected dicts as a pandas ``DataFrame``.
    """

    # Separator between a parameter name and its value (colon + optional
    # whitespace), and the shape of a single "name:value" token.
    _SEPARATOR = re.compile(r":\s{0,}")
    _PAIR = re.compile(r"\S+:\s{0,}\S+")

    def __init__(self):
        # No method of this class assigns ``browser``; the attribute is kept
        # so existing code that reads it keeps working.
        self.browser = None
        # One dict of parsed parameters per crawled product page.
        self.info_diclist = []

    @classmethod
    def _split_pair(cls, text):
        """Split ``"name: value"`` at the first separator.

        Returns:
            A ``(name, value)`` tuple, or ``None`` when *text* contains no
            separator at all.
        """
        # maxsplit=1 keeps values that themselves contain a colon intact
        # (e.g. "1920:1080") instead of truncating them at the second colon.
        parts = cls._SEPARATOR.split(text, maxsplit=1)
        if len(parts) < 2:
            return None
        return parts[0], parts[1]

    @classmethod
    def _parse_pairs(cls, text):
        """Return a dict built from every ``name:value`` token in *text*."""
        # Each token matched by _PAIR contains a colon, so the split below
        # always yields exactly two items.
        return dict(
            cls._SEPARATOR.split(token, maxsplit=1)
            for token in cls._PAIR.findall(text)
        )

    def onepage_info_by_selenium(self, browser, url):
        """Open a product detail page and record its parameters.

        Args:
            browser: Selenium WebDriver used to load the page.
            url: Address of the product detail page.

        Side effects:
            Appends one dict of parsed parameters to ``self.info_diclist``.
        """
        browser.get(url)
        brand = browser.find_element(
            by=By.CSS_SELECTOR,
            value="ul[id='parameter-brand'][class='p-parameter-list']",
        )
        details = {}
        # The brand lives in its own list, so it is parsed separately; a
        # brand text without a separator is skipped rather than aborting
        # the whole crawl.
        brand_pair = self._split_pair(brand.text)
        if brand_pair is not None:
            details[brand_pair[0]] = brand_pair[1]
        info = browser.find_element(
            by=By.CSS_SELECTOR,
            value="ul[class='parameter2 p-parameter-list']",
        )
        details.update(self._parse_pairs(info.text))
        # TODO: positive/negative review counts are not collected yet.
        self.info_diclist.append(details)

    def getmain_by_selenium(self):
        """Open the JD.com home page in Chrome and install the stored cookies.

        Returns:
            The Chrome WebDriver; the caller is responsible for quitting it.
        """
        # Read the cookies first so a database failure cannot leave an
        # orphaned browser process behind.
        cookies = self.get_cookie()
        browser = Chrome()
        try:
            # The page is loaded before the cookies are added, as in the
            # original flow.
            browser.get('https://www.jd.com')
            for cookie in cookies:
                browser.add_cookie(cookie)
        except Exception:
            browser.quit()
            raise
        return browser

    def get_search_result(self, browser, key) -> list:
        """Search for *key* and return the links of the listed products.

        Args:
            browser: WebDriver currently showing a page with the search bar.
            key: Search keyword typed into the search bar.

        Returns:
            List of ``href`` values of the product links on the result page.
        """
        search_box = browser.find_element(by=By.ID, value='key')
        search_box.send_keys(key)
        browser.find_element(
            by=By.CSS_SELECTOR,
            value="[class='button'][aria-label='搜索']",
        ).click()
        # Let the following element lookup wait up to 5 seconds for results.
        browser.implicitly_wait(5)
        anchors = browser.find_elements(
            by=By.CSS_SELECTOR,
            value="div[class='p-img'] a[target='_blank']",
        )
        links = (anchor.get_attribute(name='href') for anchor in anchors)
        # Anchors without an href would make a later browser.get() fail.
        return [link for link in links if link]

    def get_info(self, key="投影仪", limit=3):
        """Crawl the first *limit* search results for *key* and print them.

        Args:
            key: Search keyword; defaults to the previously hard-coded value.
            limit: Maximum number of result pages to visit (``None`` visits
                all of them); defaults to the previously hard-coded 3.
        """
        browser = self.getmain_by_selenium()
        try:
            results = self.get_search_result(browser, key)
            for url in results[:limit]:
                self.onepage_info_by_selenium(browser, url=url)
            self.show_as_dataframe()
            # Block until the user presses Enter before shutting down.
            input("end: ")
        finally:
            # Always release the browser/driver process, even on failure.
            browser.quit()

    def show_as_dataframe(self):
        """Print the collected parameter dicts as a pandas DataFrame."""
        data = pd.DataFrame(self.info_diclist)
        print(data)

    def get_cookie(self):
        """Load the stored JD.com cookies from the local MySQL database.

        Returns:
            The evaluated content of the ``COOKIE`` column of the first
            matching row; ``getmain_by_selenium`` iterates over it and
            passes each item to ``add_cookie``.

        Raises:
            IndexError: If no row with ``WEB_NAME='jingdong'`` exists.
        """
        # NOTE(review): credentials are hard-coded; consider moving them to
        # configuration.
        db = pymysql.connect(
            host='127.0.0.1',
            user='root',
            password='123456',
            charset='utf8',
            database='draft',
            port=3306
        )
        # try/finally so the cursor and connection are released even when
        # the query fails.
        try:
            cursor = db.cursor()
            try:
                cursor.execute(query="SELECT COOKIE FROM COOKIES WHERE WEB_NAME='jingdong'")
                rows = cursor.fetchall()
            finally:
                cursor.close()
        finally:
            db.close()
        if not rows:
            raise IndexError("no cookie stored for WEB_NAME='jingdong'")
        # SECURITY: eval() executes whatever text is stored in the database.
        # It is kept so the stored format keeps working, but anyone who can
        # write to the COOKIES table can run arbitrary code here; prefer
        # ast.literal_eval or json.loads once the stored format is confirmed.
        return eval(rows[0][0])
def main():
    """Create a spider and run one crawl with its default settings."""
    spider = Spider()
    spider.get_info()
# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()