1 Scraping 89ip proxies
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
from requests.exceptions import RequestException


class SpiderIP:
    def __init__(self):
        # Target URL
        self.tag_url = "https://www.89ip.cn/"
        self.headers = {
            "User-Agent": UserAgent().random
        }

    def spider_index_response(self):
        response = requests.get(url=self.tag_url, headers=self.headers)
        return response.text

    def create_soup(self):
        return BeautifulSoup(self.spider_index_response(), 'lxml')

    def spider_ip_port(self):
        soup = self.create_soup()
        tr_list = soup.select('div.layui-row.layui-col-space15 > div.layui-col-md8 > div > div.layui-form > table > tbody > tr')
        data_list = []
        for tr in tr_list:
            td_list = tr.find_all("td")
            ip = td_list[0].text.strip()
            port = td_list[1].text.strip()
            store = td_list[3].text.strip()
            # Build a proxies mapping of the form {"http": "http://IP:PORT"}
            data_list.append({"store": store, "proxies": {
                "http": f"http://{ip}:{port}"
            }})
        return data_list

    def __spider_baidu(self, proxies):
        try:
            response = requests.get("http://httpbin.org/get", headers=self.headers, proxies=proxies, timeout=2)
            # Check whether the request succeeded
            if response.status_code == 200:
                response.encoding = 'utf-8'  # decode the body as UTF-8
                print(response.text)  # the JSON echoed back by httpbin
                return True
            print("Request failed:", response.status_code)
        except RequestException:
            # Dead proxies raise ProxyError, ConnectTimeout, etc.; treat any
            # request failure as an unusable proxy
            pass
        return False

    def test_ip(self):
        data_list = self.spider_ip_port()
        for index, data in enumerate(data_list, start=1):
            store = data.get("store")
            proxies = data.get("proxies")
            print(f"Entry {index}, carrier: {store}")
            # The probe returns True only when the request went through the proxy
            if self.__spider_baidu(proxies=proxies):
                print("This proxy works")
            else:
                print("This proxy is dead")

    def main(self):
        self.test_ip()


if __name__ == '__main__':
    s = SpiderIP()
    s.main()
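
Once a live proxy has been found, the proxies mapping built by spider_ip_port can be passed straight to requests. A minimal, self-contained usage sketch (the IP and port below are placeholders, not real output from the crawler):

import requests

# Hypothetical proxy taken from spider_ip_port(); substitute a live one.
proxies = {"http": "http://1.2.3.4:8080"}
try:
    resp = requests.get("http://httpbin.org/get", proxies=proxies, timeout=2)
    print(resp.json()["origin"])  # httpbin echoes the IP it saw, i.e. the proxy
except requests.exceptions.RequestException:
    print("Proxy is not usable")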
2 Scraping Douban movies
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup


class SpiderBase:
    def __init__(self):
        self.tag_url_list = []
        self.headers = {
            "User-Agent": UserAgent().random
        }


class SpiderTopSoup(SpiderBase):
    def __init__(self):
        super().__init__()
        self.tag_url_list = self.__create_tag_url_list()

    def __create_tag_url_list(self):
        # The Top 250 list is paginated 25 entries per page (start=0, 25, ..., 225)
        tag_url_list = []
        for i in range(250 // 25):
            if i == 0:
                tag_url_list.append("https://movie.douban.com/top250")
            else:
                tag_url_list.append(f"https://movie.douban.com/top250?start={i * 25}")
        return tag_url_list

    def __create_soup(self, page_text):
        return BeautifulSoup(page_text, 'lxml')

    def __spider_detail_data(self, soup):
        data_list = []
        div_list = soup.find_all("div", class_="item")
        for div in div_list:
            pic_div = div.find("div", class_="pic")
            # Poster image URL
            img_url = pic_div.a.img.get("src")
            # Rank
            level = pic_div.em.text
            # Detail page URL
            detail_url = pic_div.a.get("href")
            # Title spans: Chinese title, English title, extra title info;
            # not every entry has all three
            bd_a_span_list = div.find("div", class_="info").find("div", class_="hd").a.find_all("span")
            try:
                title = bd_a_span_list[0].text
            except IndexError:
                title = ""
            try:
                title_eg = bd_a_span_list[1].text
            except IndexError:
                title_eg = ""
            try:
                title_desc = bd_a_span_list[2].text
            except IndexError:
                title_desc = ""
            bd_div = div.find("div", class_="info").find("div", class_="bd")
            # First line: director and cast; second line: release date / country / genre
            action, publish_date = [data.replace("\xa0", "").strip() for data in bd_div.p.text.strip().split("\n")]
            # Score and number of ratings
            span_list = bd_div.find("div", class_="star").find_all("span")
            score = span_list[1].text
            comment_num = span_list[-1].text[0:-3]  # strip the trailing "人评价"
            # Tagline; some entries have none
            try:
                quote = bd_div.find("p", class_="quote").span.text
            except AttributeError:
                quote = ""
            data_list.append({
                "title": title,
                "title_eg": title_eg,
                "title_desc": title_desc,
                "img_url": img_url,
                "level": level,
                "detail_url": detail_url,
                "action": action,
                "publish_date": publish_date,
                "score": score,
                "comment_num": comment_num,
                "quote": quote,
            })
        print(data_list)
        return data_list

    def spider_index_data(self, tag_url):
        response = requests.get(url=tag_url, headers=self.headers)
        soup = self.__create_soup(page_text=response.text)
        return self.__spider_detail_data(soup=soup)

    def main(self):
        data_list_all = []
        for tag_url in self.tag_url_list:
            data_list = self.spider_index_data(tag_url=tag_url)
            data_list_all.extend(data_list)
        print(len(data_list_all))


if __name__ == '__main__':
    s = SpiderTopSoup()
    s.main()
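
Version 1 returns plain dicts, which are easy to persist. A small sketch (an addition, assuming the dict layout produced by __spider_detail_data) that saves the collected records to a JSON file after the crawl:

import json

def save_movies(data_list_all, path="douban_top250.json"):
    # ensure_ascii=False keeps Chinese titles readable in the output file
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data_list_all, f, ensure_ascii=False, indent=2)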
Version 2 (recommended)
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup

# Display labels for each scraped field
movie_dict = {
    "title": 'Title',
    "title_eg": 'English title',
    "title_desc": 'Other title info',
    "img_url": 'Poster URL',
    "level": 'Rank',
    "detail_url": 'Detail page URL',
    "action": 'Director and cast',
    "publish_date": 'Release date',
    "score": 'Score',
    "comment_num": 'Number of ratings',
    "quote": 'Tagline',
}


class SpiderBase:
    def __init__(self):
        self.tag_url_list = []
        self.headers = {
            "User-Agent": UserAgent().random
        }


class SpiderTopSoup(SpiderBase):
    def __init__(self):
        super().__init__()
        self.tag_url_list = self.__create_tag_url_list()

    def __create_tag_url_list(self):
        # The Top 250 list is paginated 25 entries per page (start=0, 25, ..., 225)
        tag_url_list = []
        for i in range(250 // 25):
            if i == 0:
                tag_url_list.append("https://movie.douban.com/top250")
            else:
                tag_url_list.append(f"https://movie.douban.com/top250?start={i * 25}")
        return tag_url_list

    def __create_soup(self, page_text):
        return BeautifulSoup(page_text, 'lxml')

    def __spider_detail_data(self, soup):
        data_list = []
        div_list = soup.find_all("div", class_="item")
        for div in div_list:
            pic_div = div.find("div", class_="pic")
            # Poster image URL
            img_url = pic_div.a.img.get("src")
            # Rank
            level = pic_div.em.text
            # Detail page URL
            detail_url = pic_div.a.get("href")
            # Title spans: Chinese title, English title, extra title info;
            # not every entry has all three
            bd_a_span_list = div.find("div", class_="info").find("div", class_="hd").a.find_all("span")
            try:
                title = bd_a_span_list[0].text
            except IndexError:
                title = ""
            try:
                title_eg = bd_a_span_list[1].text
            except IndexError:
                title_eg = ""
            try:
                title_desc = bd_a_span_list[2].text
            except IndexError:
                title_desc = ""
            bd_div = div.find("div", class_="info").find("div", class_="bd")
            # First line: director and cast; second line: release date / country / genre
            action, publish_date = [data.replace("\xa0", "").strip() for data in bd_div.p.text.strip().split("\n")]
            # Score and number of ratings
            span_list = bd_div.find("div", class_="star").find_all("span")
            score = span_list[1].text
            comment_num = span_list[-1].text[0:-3]  # strip the trailing "人评价"
            # Tagline; some entries have none
            try:
                quote = bd_div.find("p", class_="quote").span.text
            except AttributeError:
                quote = ""
            data_dict = {
                "title": title,
                "title_eg": title_eg,
                "title_desc": title_desc,
                "img_url": img_url,
                "level": level,
                "detail_url": detail_url,
                "action": action,
                "publish_date": publish_date,
                "score": score,
                "comment_num": comment_num,
                "quote": quote,
            }
            # Render each field as a human-readable "label: value" string
            for key, value in movie_dict.items():
                data_list.append(f"{value}: {data_dict[key]}")
        print(data_list)
        return data_list

    def spider_index_data(self, tag_url):
        response = requests.get(url=tag_url, headers=self.headers)
        soup = self.__create_soup(page_text=response.text)
        return self.__spider_detail_data(soup=soup)

    def main(self):
        data_list_all = []
        for tag_url in self.tag_url_list:
            data_list = self.spider_index_data(tag_url=tag_url)
            data_list_all.extend(data_list)
        print(len(data_list_all))


if __name__ == '__main__':
    s = SpiderTopSoup()
    s.main()
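
Both versions fetch all ten pages back to back, and Douban can rate-limit aggressive clients. One defensive tweak (an assumption, not part of the original code) is to pause between page requests; polite_main below is a hypothetical replacement for SpiderTopSoup.main:

import random
import time

def polite_main(spider):
    # Same loop as SpiderTopSoup.main, with a 1-3 second pause per page
    data_list_all = []
    for tag_url in spider.tag_url_list:
        data_list_all.extend(spider.spider_index_data(tag_url=tag_url))
        time.sleep(random.uniform(1.0, 3.0))
    print(len(data_list_all))

Call it as polite_main(SpiderTopSoup()) in place of s.main().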