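"""Scraper for ttmeishi.com.

Walks the listing pages of the ZiHanDaoHan (自汗盗汗) category, follows every
recipe link, extracts the dish name, classification, flavor, ingredients,
main technique, cooking time, preparation steps and image URLs, and writes
the results to a CSV file. Requests go through a small proxy pool provided
by the local functions.get_proxy helper.
"""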
from requests.exceptions import RequestException
import requests
import csv
from lxml import etree
import time
from functions import get_proxy

# ttmeishi listing page for the ZiHanDaoHan (自汗盗汗) category
base_url = "http://www.ttmeishi.com/CaiXi/ZiHanDaoHan/"
# site root, prepended to the relative recipe links found on listing pages
base_menu_url = "http://www.ttmeishi.com"
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
}
# all scraped recipes
total_menu = []
# proxy pool fetched up front
proxy_list = get_proxy.get_proxys(10)
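
# NOTE: functions.get_proxy is a local helper module that is not included in
# this file. Judging from the calls used here, it is assumed to expose roughly
# this interface (a sketch of the expected shape, not the actual implementation):
#   get_proxys(n)        -> list of n candidate proxies, e.g.
#                           {"http": "http://1.2.3.4:8080"} in requests' format
#   test_proxy(proxies)  -> one working proxy dict picked from that list,
#                           suitable for requests.get(..., proxies=proxy)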

def get_html(url):
    """Fetch a recipe detail page through a proxy and return the parsed HTML tree."""
    try:
        proxy = get_proxy.test_proxy(proxy_list)
        r = requests.get(url, headers=headers, proxies=proxy, timeout=5)
        if r.status_code == 200:
            r.encoding = r.apparent_encoding
            s = etree.HTML(r.text)
            print("recipe detail request succeeded")
            return s
    except RequestException as e:
        print("-----request failed-----", e)

def get_page(page):
    """Fetch one listing page; page 1 uses base_url, later pages use list<n>.htm."""
    try:
        if page != 1:
            resp = requests.get(base_url + "list%d.htm" % page, headers=headers, timeout=5)
        else:
            resp = requests.get(base_url, headers=headers, timeout=5)
        if resp.status_code == 200:
            print("listing page ok")
            return resp.text
        else:
            print(resp.status_code)
    except RequestException as e:
        print("-----listing page request failed-----", e)

def get_urls(html):
    """Extract the recipe detail URLs from a listing page's HTML."""
    menu = []
    try:
        s = etree.HTML(html)
        urls = s.xpath('//div[@id="content" and @class="content"]'
                       '//li[@class="cx_liebiao_li"]'
                       '//a/@href')
        for url in urls:
            menu.append(base_menu_url + url)
        print(menu)
    except Exception as e:
        print("-----failed to extract recipe URLs-----", e)
    return menu


# urls: the recipe page URLs collected by get_urls
def get_info(urls):
    """Scrape every recipe page and return a list of field dicts."""
    total_info = []
    index = 1
    for url in urls:
        # throttle: a long pause every 10th request, a short one otherwise
        if index % 10 == 0:
            time.sleep(10)
            print("long sleep " + str(index))
        else:
            time.sleep(2)
            print("short sleep " + str(index))
        try:
            s = get_html(url)
            dic = {
                "菜名": "",
                "分类": "",
                "口味": "",
                "食材": "",
                "主要工艺": "",
                "制作时间": "",
                "做法": "",
                "图片url": "",
            }
            dic["菜名"] = s.xpath('//div[@id= "content" and @class= "content"]'
                                '//h1/text()')[0]
            classification_info1 = s.xpath('//ul[@class= "fenlei_ul"]//li'
                                           '//span[@class="fenlei_li_name"]'
                                           '//a/text()')
            classification_info2 = s.xpath('//ul[@class= "fenlei_ul"]//li'
                                           '//span[@class="fenlei_li_sm"]/text()')
            for i in range(len(classification_info1)):
                classification_info2[i] = classification_info2[i] + ":" \
                                          + classification_info1[i]
            # the classification block mixes several fields, so iterate over it
            # and split the entries apart by their label prefix
            for classification in classification_info2:
                if classification[0:2] == "分类":
                    dic["分类"] = dic["分类"] + "," + classification[3:]
                    dic["分类"] = dic["分类"].strip(",")
                elif classification[0:4] == "菜品口味":
                    dic["口味"] = classification[5:]
                elif classification[2:4] == "时间":
                    dic["制作时间"] = classification[5:]
                elif classification[2:4] == "工艺":
                    dic["主要工艺"] = classification[5:]
                elif classification[0:2] == "工艺":
                    dic["主要工艺"] = classification[3:]
                elif classification[2:4] == "难度":
                    pass
                else:
                    dic["食材"] = dic["食材"] + "," + classification
            # step_info temporarily holds the raw preparation steps for the cleanup below
            step_info1 = s.xpath('//div[@class="c_bz_neirong_b0ah"]/text()')
            step_info2 = s.xpath('//div[@class="c_buzhou cbox"]/text()')
            for i in range(len(step_info1)):
                step_info1[i] = str(i + 1) + "." + step_info1[i]
            for i in range(len(step_info2)):
                step_info2[i] = step_info2[i].strip()
            dic["做法"] = step_info1 + step_info2
            dic["图片url"] = s.xpath('//div[@class="cbox c_img"]//img/@src')
            total_info.append(dic)
            print("done " + str(index))
            index = index + 1
        except Exception as e:
            print("error scraping " + url, e)
    return total_info

# the final result: recipes for several hundred dishes
# write them to a CSV file
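# Each record handed to write_csv is a dict keyed by the CSV field names below.
# Note that "做法" and "图片url" hold Python lists, so csv.DictWriter writes
# their str() representation into the cell. Illustrative shape only:
#   {"菜名": "...", "分类": "...", "口味": "...", "食材": ",...",
#    "主要工艺": "...", "制作时间": "...", "做法": ["1. ..."], "图片url": ["http://..."]}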
def write_csv(menu_list):
    """Write the scraped recipes to a CSV file, one row per recipe."""
    try:
        with open(r"E:\datapy\脏腑调理\自汗盗汗.csv", "w", newline="") as csvfile:
            index = 1
            fieldnames = ["菜名", "分类", "口味", "食材", "主要工艺", "制作时间", "做法", "图片url"]
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for menu in menu_list:
                try:
                    writer.writerow(menu)
                    print("write " + str(index))
                    index = index + 1
                except Exception as e:
                    print("file error: failed to write row", e)
    except Exception as e:
        print("file error: failed to open the output file", e)

if __name__ == '__main__':
    urls = []
    for i in range(4):
        time.sleep(1)
        print("main sleep " + str(i))
        html = get_page(i + 1)
        urls = get_urls(html) + urls
    m = get_info(urls)
    write_csv(m)
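
# Running this module directly crawls listing pages 1-4, scrapes every recipe
# found and writes the CSV; expect it to take a while because of the
# per-request sleeps in get_info.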