162 lines
5.8 KiB
Python
162 lines
5.8 KiB
Python
from requests.exceptions import RequestException
|
||
import requests
|
||
import csv
|
||
from lxml import etree
|
||
import time
|
||
from functions import get_proxy
|
||
|
||
# ttmeishi.com listing page for the "自汗盗汗" (spontaneous/night sweating) diet category
base_url = "http://www.ttmeishi.com/CaiXi/ZiHanDaoHan/"
# site root, prepended to the relative recipe links scraped from the listings
base_menu_url = "http://www.ttmeishi.com"

# desktop Chrome User-Agent so the site serves the normal desktop page
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
}
# accumulator for all recipes (NOTE(review): never written to in this file — confirm it is used elsewhere)
total_menu=[]
# freshly scraped proxy pool; NOTE: fetched at import time (network side effect)
proxy_list = get_proxy.get_proxys(10)
|
||
|
||
def get_html(url):
    """Fetch *url* through a proxy from the pool and parse it with lxml.

    Returns the parsed lxml element tree on success, or None when the
    request raises or the server answers with a non-200 status.
    """
    try:
        # Pick a working proxy out of the pre-fetched pool.
        proxy = get_proxy.test_proxy(proxy_list)
        r = requests.get(url, headers=headers, proxies=proxy, timeout=5)
        if r.status_code == 200:
            # Let requests detect the page's real encoding (site mixes GBK/UTF-8).
            r.encoding = r.apparent_encoding
            s = etree.HTML(r.text)
            print("菜谱详情请求成功")
            return s
        # fix: a non-200 status used to fall through and return None silently
        print(r.status_code)
        return None
    except RequestException as e:
        print("-----地址错误-----", e)
        return None
|
||
|
||
|
||
def get_page(page):
    """Download one listing page and return its raw HTML text.

    Page numbering starts at 1: page 1 is ``base_url`` itself, page N
    (N >= 2) is ``base_url + "listN.htm"``.  Returns None on any failure.
    """
    # fix: the two original branches duplicated identical request/status
    # handling and differed only in the URL — build the URL once instead.
    url = base_url if page == 1 else base_url + "list%d.htm" % page
    try:
        re = requests.get(url, headers=headers, timeout=5)
        if re.status_code == 200:
            print("一页ok")
            return re.text
        print(re.status_code)
        return None
    except RequestException as e:
        print("asd", e)
        return None
|
||
|
||
def get_urls(html):
    """Extract the absolute recipe URLs from one listing page's HTML.

    Each relative href found in the listing is prefixed with
    ``base_menu_url``.  Always returns a list (possibly empty), so the
    caller's ``get_urls(html) + urls`` concatenation cannot TypeError.
    """
    menu = []
    try:
        s = etree.HTML(html)
        urls = s.xpath('//div[@id="content" and @class="content"]'
                       '//li[@class="cx_liebiao_li"]'
                       '//a/@href')
        menu = [base_menu_url + url for url in urls]
        print(menu)
    except Exception as e:
        # fix: the body makes no network call, so RequestException could
        # never be raised here — lxml/parse errors escaped uncaught.
        print("-----获取菜谱失败-----", e)
    # fix: on failure the original implicitly returned None, which crashed
    # the caller; return the (possibly empty) list instead.
    return menu
|
||
|
||
#获取菜谱得网址
|
||
def get_info(urls):
    """Scrape every recipe page in *urls*; return a list of recipe dicts.

    Each dict carries the keys 菜名, 分类, 口味, 食材, 主要工艺, 制作时间,
    做法 and 图片url.  Pages that fail to fetch or parse are skipped with
    an error message.  Sleeps between requests to stay polite: 10 s every
    tenth page, 2 s otherwise.
    """
    total_info = []
    # fix: the original only incremented `index` on success, so after an
    # error the throttling counter stalled; enumerate advances it always.
    for index, url in enumerate(urls, start=1):
        if index % 10 == 0:
            time.sleep(10)
            print("大睡" + str(index))
        else:
            time.sleep(2)
            print("小睡" + str(index))
        # fix: build the dict before the try so the except handler can
        # always reference dic["菜名"] (it was unbound when get_html failed
        # before the dict literal executed).
        dic = {
            "菜名": "",
            "分类": "",
            "口味": "",
            "食材": "",
            "主要工艺": "",
            "制作时间": "",
            "做法": "",
            "图片url": "",
        }
        try:
            s = get_html(url)
            dic["菜名"] = s.xpath('//div[@id= "content" and @class= "content"]'
                                 '//h1/text()')[0]
            # Classification labels (分类/口味/工艺/...) and their values come
            # from two parallel node lists; fuse them into "value:label" text.
            classification_info1 = s.xpath('//ul[@class= "fenlei_ul"]//li'
                                           '//span[@class="fenlei_li_name"]'
                                           '//a/text()')
            classification_info2 = s.xpath('//ul[@class= "fenlei_ul"]//li'
                                           '//span[@class="fenlei_li_sm"]/text()')
            for i in range(len(classification_info1)):
                classification_info2[i] = classification_info2[i] + ":" \
                                          + classification_info1[i]
            # The classification block mixes several fields — dispatch each
            # entry to the right dict key by its text prefix.
            for classification in classification_info2:
                if classification[0:2] == "分类":
                    dic["分类"] = dic["分类"] + "," + classification[3:]
                    # fix: str.strip returns a new string; the original call
                    # discarded the result, leaving a leading comma behind.
                    dic["分类"] = dic["分类"].strip(",")
                elif classification[0:4] == "菜品口味":
                    dic["口味"] = classification[5:]
                elif classification[2:4] == "时间":
                    dic["制作时间"] = classification[5:]
                elif classification[2:4] == "工艺":
                    dic["主要工艺"] = classification[5:]
                elif classification[0:2] == "工艺":
                    dic["主要工艺"] = classification[3:]
                elif classification[2:4] == "难度":
                    # difficulty is deliberately not recorded
                    pass
                else:
                    dic["食材"] = dic["食材"] + "," + classification
            # Cooking steps: numbered paragraphs plus the looser step blocks.
            step_info1 = s.xpath('//div[@class="c_bz_neirong_b0ah"]/text()')
            step_info2 = s.xpath('//div[@class="c_buzhou cbox"]/text()')
            for i in range(len(step_info1)):
                step_info1[i] = str(i + 1) + "." + step_info1[i]
            for i in range(len(step_info2)):
                step_info2[i] = step_info2[i].strip()
            dic["做法"] = step_info1 + step_info2
            dic["图片url"] = s.xpath('//div[@class="cbox c_img"]//img/@src')
            total_info.append(dic)
            print("done" + str(index))
        except Exception as e:
            # fix: was a bare `except:` that swallowed even KeyboardInterrupt
            # and hid the actual error.
            print(dic["菜名"] + "error", e)
    return total_info
|
||
|
||
#最终几百道菜的菜谱
|
||
|
||
#写入文件
|
||
def write_csv(menu_list, path=r"E:\datapy\脏腑调理\自汗盗汗.csv"):
    """Write the scraped recipe dicts to a CSV file.

    menu_list: list of dicts keyed by the fieldnames below; rows that
               fail to serialize are reported and skipped.
    path: destination file.  Added as a backward-compatible keyword so
          the function is reusable/testable; the default preserves the
          original hard-coded location.
    """
    try:
        # fix: no encoding was specified, so Chinese text was written in the
        # platform default; utf-8-sig also lets Excel detect the encoding.
        with open(path, "w", newline="", encoding="utf-8-sig") as csvfile:
            fieldnames = ["菜名", "分类", "口味", "食材", "主要工艺", "制作时间", "做法", "图片url"]
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for index, menu in enumerate(menu_list, start=1):
                try:
                    writer.writerow(menu)
                    print("write" + str(index))
                except Exception as e:
                    print("文件操作异常,写入失败", e)
    except OSError as e:
        # fix: was a bare except that hid why the open failed
        print("文件操作异常,打开失败", e)
|
||
|
||
if __name__ == '__main__':
    # Crawl the first 4 listing pages, collecting recipe URLs newest-first,
    # then scrape each recipe and dump everything to CSV.
    urls = []
    for i in range(4):
        time.sleep(1)
        print("主函数睡" + str(i))
        html = get_page(i + 1)
        # fix: get_page returns None on failure and get_urls could return a
        # falsy result — the original `get_urls(html) + urls` then raised
        # TypeError; guard both before concatenating.
        if html:
            page_urls = get_urls(html)
            if page_urls:
                urls = page_urls + urls
    m = get_info(urls)
    write_csv(m)