Scraping Web Page Templates from 站长素材网 (sc.chinaz.com)
Introduction
I stumbled on this site by accident, and it reminded me of the trouble I had finding practice templates back when I was learning Flask: every template either cost money or required an account, and I couldn't write one myself 😃 (my CSS knowledge evaporated as fast as I learned it). In the end I found a few usable ones on GitHub to practice Flask with, though by now I only remember the rough outline and will have to relearn it.
This site is completely free and doesn't require a login, which I love. It has plenty of material, and the pages all share basically the same layout, so a small tweak to the parsing rules is enough to reuse the scraper.
Since some readers may want to download the templates, I also wrote an asynchronous download script.
A small tip: whenever I finish a piece of code, I paste it into a GPT tool to help optimize and debug it, and the results have been quite good.
Code implementation
"""
@ 😀Author : 🎈
@ ⏲️Time : 2024-02-24
@ 📄File : denglu_moban.py
@ ℹ️Description:
https://sc.chinaz.com/tag_moban/denglu.html
https://sc.chinaz.com/tag_moban/denglu_2.html
Login-page templates on 站长素材网 (sc.chinaz.com)
"""
import logging
from pathlib import Path
from urllib.parse import urljoin
import json
import requests
from lxml import etree
TOTAL_PAGES = 2
BASE_URL = 'https://sc.chinaz.com/'
JSON_FILE_NAME = 'xiangyingshi.json'
def set_log() -> None:
"""日志记录,在程序开始前调用即可"""
    # path of the current script
    file_path = Path(__file__)
    # a 'log' folder next to the script
    log_folder_path = file_path.parent / 'log'
log_folder_path.mkdir(exist_ok=True)
filename = f'{file_path.stem}-script1.log'
log_file_path = log_folder_path / filename
    # log output format
formatter = logging.Formatter(fmt='%(asctime)s\t[%(levelname)s]\t[%(name)s]\t[%(threadName)s]\t[Line: %(lineno)d]\t%(message)s',
datefmt='%Y-%m-%d %H:%M:%S', )
    # handler that writes to the log file
file_handler = logging.FileHandler(log_file_path, mode='w', encoding='utf-8')
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(formatter)
    # handler that prints to the console
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
console_handler.setFormatter(formatter)
    # attach both handlers to the root logger
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(file_handler)
logger.addHandler(console_handler)
def get_page(url):
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "zh-CN,zh;q=0.9",
"Cache-Control": "no-cache",
# "Cookie": "",
"Pragma": "no-cache",
"Referer": "https://sc.chinaz.com/moban/",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
}
    try:
        response = requests.get(url, headers=headers, timeout=10)  # timeout so a stalled request can't hang the run
        response.raise_for_status()
        response.encoding = 'utf-8'
logging.info('Successful %d' % response.status_code)
return response.text
except requests.exceptions.RequestException:
logging.exception('Error making GET request to %s' % url)
def get_all_page(url, pages):
page_data = []
    for page in range(1, pages+1):  # page 1 uses the plain URL; requesting xiangyingshi_1.html returns a 404
if page == 1:
url_page = url
else:
            # pagination URL is hard-coded per tag; edit it together with page_url in main()
            url_page = f'https://sc.chinaz.com/tag_moban/xiangyingshi_{page}.html'
logging.info('Start url : %s' % url_page)
        html_data = get_page(url_page)
        if html_data is None:  # get_page returns None when the request fails
            continue
        data = parse_page(html_data)
        page_data.extend(data)
return page_data
def save_json(data):
"""保存数据到文件"""
with open(JSON_FILE_NAME, 'w', encoding='utf-8')as f:
json.dump(data, f, indent=4, ensure_ascii=False)
def parse_page(html):
doc = etree.HTML(html)
item_box = doc.xpath('//*[@id="container"]/div')
data_list = []
for item in item_box:
title = item.xpath('./p/a/text()')[0]
href = item.xpath('./div/a/@href')[0]
href = urljoin('https://sc.chinaz.com', href)
        img = item.xpath('./div/a/img/@src')[0]
        img = urljoin('https://', img)  # the src is protocol-relative ("//..."), so prepend the scheme
dic = {
'title': title,
'img': img,
'child_url': href,
}
logging.info('Parsed item: %s' % dic)
data_list.append(dic)
return data_list
def parse_child_page(data):  # the list produced by get_all_page
    for index, info in enumerate(data):  # use the index to write download_url back into the right entry
        child_url = info['child_url']
        logging.info('Start child page: %s' % child_url)
        child_html = get_page(child_url)
        if child_html is None:  # skip entries whose detail page could not be fetched
            continue
        doc = etree.HTML(child_html)
        # the third <a> in the download box is where the download link sits on these pages
        download_url = doc.xpath('//div[@class="downbody"]/div[3]/a[3]/@href')[0]
        data[index]['download_url'] = download_url
        logging.info('get download url : %s' % download_url)
save_json(data)
def main():
"""使用方法:1.复制链如: https://sc.chinaz.com/tag_moban/jianli.html 到page_url
2. 修改82行的链接
3. 修改JSON_FILE_NAME的值
只要是tag_moban下的都可以使用
"""
set_log()
page_url = 'https://sc.chinaz.com/tag_moban/xiangyingshi.html'
json_data = get_all_page(page_url, TOTAL_PAGES)
parse_child_page(json_data)
if __name__ == '__main__':
main()
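The usage notes in main() ask you to edit the hard-coded pagination URL by hand for every new tag. One way around that, sketched below as my own variation rather than part of the original script, is to derive each page's URL from the first-page URL, following the pattern the site uses (e.g. denglu.html -> denglu_2.html):
def build_page_url(first_page_url: str, page: int) -> str:
    """Build page N's URL from the first-page URL; page 1 keeps the plain URL."""
    if page == 1:
        return first_page_url
    stem, ext = first_page_url.rsplit('.', 1)  # split off ".html"
    return f'{stem}_{page}.{ext}'              # e.g. ..._2.html
With this, the f-string in get_all_page becomes url_page = build_page_url(url, page), and steps 1 and 2 of the usage notes collapse into a single edit.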
Run output: [screenshot]
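For reference, each entry the scraper writes to the JSON file has the shape below; the values are placeholders, not real scraped data:
{
    "title": "...",
    "img": "https://...",
    "child_url": "https://sc.chinaz.com/...",
    "download_url": "https://..."
}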
Download script:
from pathlib import Path
import json
import asyncio
import aiohttp
import aiofiles
FOLDER_DIR = 'moban'
# read the JSON data asynchronously
async def load_json_data(filepath):
async with aiofiles.open(filepath, 'r', encoding='utf-8') as f:
content = await f.read()
return json.loads(content)
async def scrape_page(url, session):
try:
async with session.get(url) as response:
if response.status == 200:
content = await response.content.read()
folder_path = Path(FOLDER_DIR)
folder_path.mkdir(exist_ok=True)
                file_name = url.rsplit('/', 1)[-1]  # file name = the URL's last path segment
data_path = folder_path / file_name
async with aiofiles.open(data_path, mode='wb') as f:
await f.write(content)
                    print(f'Saving: {file_name}')
else:
                print(f'Fetching {url} --> status code: {response.status}...')
except aiohttp.ClientError as err:
print(f'Error! {err}')
async def main(urls):
async with aiohttp.ClientSession() as session:
tasks = [asyncio.create_task(scrape_page(url, session)) for url in urls]
await asyncio.gather(*tasks)
if __name__ == '__main__':
    # asyncio.run replaces the deprecated get_event_loop pattern
    # note: the file name must match the JSON_FILE_NAME the scraper wrote
    json_data = asyncio.run(load_json_data('denglu.json'))
    # skip entries whose download_url could not be scraped
    url_list = [item['download_url'] for item in json_data if item.get('download_url')]
    asyncio.run(main(url_list))
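One caveat: the script launches every download at once, which can get you throttled or refused by the server. Capping concurrency is a small change; the sketch below is my own addition (main_limited and the limit of 5 are arbitrary, not part of the script above):
async def main_limited(urls, limit=5):
    sem = asyncio.Semaphore(limit)  # at most `limit` downloads run at the same time
    async with aiohttp.ClientSession() as session:
        async def bounded(url):
            async with sem:  # wait for a free slot before starting the download
                await scrape_page(url, session)
        await asyncio.gather(*(bounded(u) for u in urls))
To use it, swap main(url_list) for main_limited(url_list) in the __main__ block.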
Run result: [screenshot]
The project has also been uploaded to my Gitee; stars are welcome.
Next I plan to relearn Flask, and I'll share my study notes as well 😀