from functions import send_request, get_selector
import csv

base_url = "http://www.39yst.com"
list_url = "http://www.39yst.com/changshi/list_24_"
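
# The helper module `functions` is not included in this file. A minimal sketch
# of what it is assumed to provide, with names inferred from how they are
# called below; the use of requests/lxml and the gbk decode are assumptions:
#
#   # functions/send_request.py
#   import requests
#
#   def send_requests(url):
#       """Fetch a page and return its decoded HTML text."""
#       resp = requests.get(url)
#       return resp.content.decode('gbk', errors='ignore')
#
#   # functions/get_selector.py
#   from lxml import etree
#
#   def get_selector(html):
#       """Parse HTML text into an lxml element for XPath queries."""
#       return etree.HTML(html)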


def get_url(page_url):
    """
    Return all article URLs found on the given list page.
    :param page_url: URL of one list page
    :return: list of article URLs, or an empty list on failure
    """
    try:
        s = get_selector.get_selector(send_request.send_requests(page_url))
        # Each article on a list page sits in div.item_all, with the link
        # inside p.item_tit.
        urls = s.xpath('//div[@class="item_all"]//p[@class="item_tit"]//a/@href')
        return urls
    except Exception as e:
        print(e)
        return []
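

# Quick sanity check for get_url (hypothetical; only meaningful while the
# site's list-page markup still matches the XPath above):
#
#   urls = get_url(list_url + "1.shtml")
#   print(len(urls), "article links found")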


def get_info(url):
    """
    Extract the title and paragraph text from an article detail page.
    :param url: URL of the article detail page
    :return: dict with 'title' and 'content' keys, or None on failure
    """
    try:
        s = get_selector.get_selector(send_request.send_requests(url))
        paragraphs = s.xpath('//div[@id="articleContent"]/p/following-sibling::*/text()')
        title = s.xpath('//h1/text()')[0]
        # Strip whitespace and any leading punctuation from each fragment,
        # keeping only fragments longer than one character.
        cleaned = []
        for text in paragraphs:
            text = text.strip().lstrip('。(?)!:、,》《:')
            if len(text) > 1:
                cleaned.append(text)
        # Cut the article off at the site's copyright notice, if present
        # (kept in Chinese because it must match the page text exactly).
        try:
            index = cleaned.index("本篇文章版权归民福康健康所有,未经许可,谢绝转载。")
            content = "\n".join(cleaned[:index])
        except ValueError:
            content = "\n".join(cleaned)
        print("Fetching", cleaned[0][:5], "...")
        return {'title': title, 'content': content}
    except Exception as e:
        print(e)
        return None
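

# Illustration of the cleaning step above (made-up input fragment):
#
#   '。,养生常识 '.strip().lstrip('。(?)!:、,》《:')  # -> '养生常识'
#
# lstrip with a character set removes any leading run of those characters,
# so one call covers every punctuation mark at the start of a fragment.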


def write_csv(path, dics):
    """
    Write the scraped articles to a CSV file.
    :param path: output file path
    :param dics: list of info dicts as returned by get_info
    :return:
    """
    with open(path, 'w', newline='', encoding='utf-8') as file:
        fieldnames = ['title', 'content']
        writer = csv.DictWriter(file, fieldnames)
        writer.writeheader()
        for dic in dics:
            if dic:  # skip articles that failed to download
                writer.writerow(dic)
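

# open() with newline='' is what the csv module documents for writers; the
# explicit utf-8 encoding avoids the platform default, which can fail on the
# Chinese text this script collects. Usage (hypothetical path):
#
#   write_csv("trick.csv", [{'title': 't', 'content': 'c'}])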


if __name__ == "__main__":
    info_list = []
    # Walk the first 20 list pages; get_url and get_info swallow their own
    # errors, so a failed page or article simply contributes nothing.
    for i in range(20):
        page_url = list_url + "{page}.shtml".format(page=i + 1)
        for info_url in get_url(page_url):
            info = get_info(info_url)
            if info:
                info_list.append(info)
    write_csv(path=r"E:\datapy\trick.csv", dics=info_list)