Python/爬虫工厂/main.py

# -*- coding: utf-8 -*-
'''
Script name:
    Monitoring data factory
Description:
    Initializes search tasks from monitoring tasks, then initializes and
    starts a spider for each search task.
'''
# Load modules
from urllib.parse import quote
import time
import json
from selenium.webdriver import Chrome
#from undetected_chromedriver import Chrome
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.common.by import By
from concurrent.futures import ThreadPoolExecutor, as_completed
import sys
sys.path.append('..')
from utils.request import Feishu
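# Note: utils.request.Feishu is a project-local helper not shown in this file.
# Judging from its usage below, it is assumed to expose
#     Feishu().request(url, method = 'get', body = None)
# and to return a response object with .code (200 on success) and .data
# (the parsed JSON payload, e.g. response.data.get('items')).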
# Initialize search tasks from monitoring tasks
def Initialize_search() -> None:
    print('Initializing search tasks from monitoring tasks... ', end = '')
    # List records (monitoring tasks)
    url = 'https://open.feishu.cn/open-apis/bitable/v1/apps/KOztbJC63aLF4qsaZK2cuGdXnob/tables/tblcVxN52hZKT3z1/records'
    monitor_tasks = Feishu().request(url)
    # Check whether fetching the monitoring tasks succeeded
    if monitor_tasks.code != 200:
        print('Failed to fetch monitoring tasks, aborting this run!')
        print()
        return None
    monitor_tasks = monitor_tasks.data.get('items')
    # Check whether the monitoring task list is empty
    if monitor_tasks is None:
        print('No monitoring tasks, aborting this run!')
        print()
        return None
    # List records (search tasks)
    url = 'https://open.feishu.cn/open-apis/bitable/v1/apps/KOztbJC63aLF4qsaZK2cuGdXnob/tables/tblSsnhXhA24PIsy/records'
    search_tasks = Feishu().request(url)
    # Check whether fetching the search tasks succeeded
    if search_tasks.code != 200:
        print('Failed to fetch search tasks, aborting this run!')
        print()
        return None
    search_tasks = search_tasks.data.get('items')
    # Check whether the search task list is empty
    if search_tasks is None:
        comparators = []
    else:
        # Keep only the monitoring task ID, search channel and keyword of each
        # search task, for comparison when adding and deleting search tasks
        # (the Chinese keys are field names in the Feishu bitable and must stay as-is)
        comparators = [{'监控任务标识': search_task.get('fields').get('监控任务标识'), '搜索渠道': search_task.get('fields').get('搜索渠道'), '关键词': search_task.get('fields').get('关键词')} for search_task in search_tasks]
    # Collects the search tasks to be added
    records = []
    # Define the last search time as a millisecond timestamp for 2023-12-25 00:00:00
    last_search_time = int(time.mktime(time.strptime('23-12-25', '%y-%m-%d')) * 1000)
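    # For reference: with the machine clock in UTC+8 this evaluates to
    # 1703433600000; time.mktime uses the local timezone, so the exact value
    # depends on where the script runs.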
    for monitor_task in monitor_tasks:
        # Parse the monitoring task ID
        record_id = monitor_task.get('record_id')
        # Parse the search channels
        search_channels = monitor_task.get('fields').get('搜索渠道')
        # Parse the keywords (there may be several, separated by full-width ',')
        keywords = monitor_task.get('fields').get('关键词').split(',')
        # Generate a new search task for every channel/keyword combination
        for search_channel in search_channels:
            for keyword in keywords:
                fields = {
                    '监控任务标识': record_id,
                    '搜索渠道': search_channel,
                    '关键词': keyword
                }
                if fields not in comparators:
                    # Add maximum result pages, search period (hours), last
                    # search time and search status (disabled by default)
                    fields.update({'最多搜索页数': 1, '搜索周期': 168, '最近搜索时间': last_search_time, '搜索状态': '停用'})
                    records.append(
                        {
                            'fields': fields
                        }
                    )
    # Check whether there are search tasks to add
    if records:
        # Create multiple records
        url = 'https://open.feishu.cn/open-apis/bitable/v1/apps/KOztbJC63aLF4qsaZK2cuGdXnob/tables/tblSsnhXhA24PIsy/records/batch_create'
        response = Feishu().request(url, 'post', {'records': records})
        # Check whether creating the search tasks succeeded
        if response.code != 200:
            print('Failed to create search tasks, aborting this run!')
            print()
            return None
    # Check whether there are existing search tasks
    if search_tasks is not None:
        # Collect the search tasks to delete (those whose monitoring task is gone)
        records = [search_task.get('record_id') for search_task in search_tasks if search_task.get('fields').get('监控任务标识') is None]
        if records:
            # Delete multiple records
            url = 'https://open.feishu.cn/open-apis/bitable/v1/apps/KOztbJC63aLF4qsaZK2cuGdXnob/tables/tblSsnhXhA24PIsy/records/batch_delete'
            response = Feishu().request(url, 'post', {'records': records})
            # Check whether deleting the search tasks succeeded
            if response.code != 200:
                print('Failed to delete search tasks, aborting this run!')
                print()
                return None
    print('Initialized.')
    print()
# Run a spider for one search task
def Spider(search_task) -> list:
    records = []
    # Search URL
    search_url = search_task.get('search_url')
    print('Crawling %s ... ' % search_url, end = '')
    try:
        # Initialize Chrome
        #chrome = Chrome(headless = False, use_subprocess = True)
        chrome = Chrome()
        chrome.get(search_url)
        # Search channel
        search_channel = search_task.get('search_channel')
        match search_channel:
            case '百度':
                # Locate the result-title links
                elements = WebDriverWait(chrome, 30).until(expected_conditions.presence_of_all_elements_located((By.XPATH, '//h3[contains(@class,"c-title t t tts-title")]/a')))
            case '搜狗':
                elements = WebDriverWait(chrome, 30).until(expected_conditions.presence_of_all_elements_located((By.XPATH, '//h3[contains(@class,"vr-title")]/a')))
            case '360':
                elements = WebDriverWait(chrome, 30).until(expected_conditions.presence_of_all_elements_located((By.XPATH, '//h3[contains(@class,"res-title")]/a')))
        # Parse the monitored object
        monitor_object = search_task.get('monitor_object')
        # Parse the keyword
        keyword = search_task.get('keyword')
        # Record the search time
        search_time = int(time.time() * 1000)
        for element in elements:
            records.append(
                {
                    'monitor_object': monitor_object,
                    'search_channel': search_channel,
                    'keyword': keyword,
                    'search_url': search_url,
                    'fields':
                    {
                        # Result title
                        'title': element.get_attribute('outerText'),
                        # Result link
                        'hyperlink': element.get_attribute('href')
                    },
                    'search_time': search_time
                }
            )
        print('done.')
        print()
    except Exception:
        print('Crawl failed!')
        print()
    finally:
        try:
            chrome.quit()
        except Exception:
            pass
    return records
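
# Illustrative standalone use of Spider (hypothetical task dict, not part of
# the original script; assumes a chromedriver on PATH):
#
#     demo_task = {
#         'monitor_object': 'demo',
#         'search_channel': '百度',
#         'keyword': 'python',
#         'search_url': 'https://www.baidu.com/s?wd=python&ie=utf-8&pn=0',
#     }
#     for record in Spider(demo_task):
#         print(record['fields']['title'], record['fields']['hyperlink'])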
# Initialize crawl tasks from search tasks
def Initialize_crawl() -> None:
    print('Initializing crawl tasks from search tasks... ', end = '')
    # List records (search tasks whose status is '启用' / enabled)
    url = 'https://open.feishu.cn/open-apis/bitable/v1/apps/KOztbJC63aLF4qsaZK2cuGdXnob/tables/tblSsnhXhA24PIsy/records?filter={}'.format(quote('CurrentValue.[搜索状态]="启用"'))
    search_tasks = Feishu().request(url)
    # Check whether fetching the enabled search tasks succeeded
    if search_tasks.code != 200:
        print('Failed to fetch enabled search tasks, aborting this run!')
        print()
        return None
    search_tasks = search_tasks.data.get('items')
    # Check whether there are enabled search tasks
    if search_tasks is None:
        print('No enabled search tasks, aborting this run!')
        print()
        return None
    # Collects the crawl tasks
    crawl_tasks = []
    for search_task in search_tasks:
        # Parse the last search time
        last_search_time = search_task.get('fields').get('最近搜索时间')
        # Parse the search period (hours)
        search_period = int(search_task.get('fields').get('搜索周期'))
        # Generate crawl tasks when more hours than the search period have
        # elapsed since the last search
        if (int(time.time() * 1000) - last_search_time) / 3600000 >= search_period:
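            # Timestamps are in milliseconds, so dividing by 3600000 converts
            # the difference to hours; with the default period of 168 hours
            # this amounts to one crawl per week.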
            # Parse the search task ID
            record_id = search_task.get('record_id')
            # Parse the search channel
            search_channel = search_task.get('fields').get('搜索渠道')
            # Parse the keyword
            keyword = search_task.get('fields').get('关键词')
            # Parse the maximum number of result pages
            maximum_pages = int(search_task.get('fields').get('最多搜索页数'))
            # Generate one crawl task per result page
            for page in range(maximum_pages):
                # Only crawling via URL is considered
                match search_channel:
                    case '百度':
                        # Baidu paginates by result offset: pn = page * 10
                        url = 'https://www.baidu.com/s?wd={}&ie=utf-8&pn={}'.format(quote(keyword), page * 10)
                        xpath = '//h3[contains(@class,"c-title t t tts-title")]/a'
                    case '搜狗':
                        # ie sets the keyword encoding; pages are 1-based
                        url = 'https://www.sogou.com/web?query={}&page={}&ie=utf8'.format(quote(keyword), page + 1)
                        xpath = '//h3[contains(@class,"vr-title")]/a'
                    case '360':
                        url = 'https://www.so.com/s?q={}&pn={}'.format(quote(keyword), page + 1)
                        xpath = '//h3[contains(@class,"res-title")]/a'
                # Build a flat task dict in the shape Spider() expects
                crawl_tasks.append(
                    {
                        'record_id': record_id,
                        'search_channel': search_channel,
                        'keyword': keyword,
                        'search_url': url
                    }
                )
    print('Initialized.')
    print()
    print('Starting spiders...')
    print()
    # Create a thread pool
    with ThreadPoolExecutor(max_workers = 2) as executor:
        threads = []
        for crawl_task in crawl_tasks:
            # Submit each crawl task to the thread pool
            thread = executor.submit(Spider, crawl_task)
            threads.append(thread)
        results = []
        for thread in as_completed(threads):
            result = thread.result()
            # Extend the results with non-empty spider output
            if result:
                results.extend(result)
        print(results)
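        # Each result record (see Spider above) has the shape:
        #     {'monitor_object': ..., 'search_channel': ..., 'keyword': ...,
        #      'search_url': ..., 'fields': {'title': ..., 'hyperlink': ...},
        #      'search_time': <millisecond timestamp>}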

if __name__ == '__main__':
    Initialize_search()
    Initialize_crawl()