XPath规则
本项目是基于 video-hub 的简化版本,通过 XPath 解析网站内容,用户仅需编写 XPath 规则即可实现小组件的数据抓取。
小组件目录结构
小组件文件夹应包含以下内容:
xpath-widget.zip
├── core // xpath-core代码 可到github下载
├── config.js // xpath配置文件
├── ext.js // [可选] 扩展js文件
├── widget.json
├── widget.js
└── icon.png
文件说明
- core:xpath-core 代码,可到 GitHub 下载。
- config.js:XPath 规则配置文件。
- ext.js(可选):扩展 JavaScript 逻辑文件。
内置关键字模版
执行时,以下关键字会被动态替换:
- {base_url}:若 config.js 配置了 base_url,该字段会被替换成实际 URL。
- {type_id}:home 方法的 type_id。
- {page}:页码,应用会自动累加。
- {wd}:搜索接口的关键词。
- {index}:*_node 相关节点,如果存在父节点的包含关系,遍历时会替换为对应的索引。
规则说明
{
// [可选] core封装的是通用逻辑,如果不满足,可以通过ext.js来扩展js逻辑
"ext": "./ext.js",
// [可选] 配置小组件user-agent
"ua": "",
// [可选] 全局base_url,配置了可替换下面出现的{base_url}
"base_url": "https://gimy.tv/",
// [可选] 如果涉及到多个domain,需要动态确定base_url,可以通过自定义js来处理,依赖ext.js
"base_url_function": "getBaseUrl",
// 对应home方法
// home方法请求地址
"homeUrl": "{base_url}",
// [可选] 如果想自定义分类,例如手动配置分类名字,或者指定对应分类列表的请求地址(list_url)
"home_cate_manual": {
"latest-updates": {
"url": "{base_url}/{type_id}/?mode=async&function=get_block&block_id=list_videos_latest_videos_list&sort_by=post_date&from={page}"
},
"new-release": {
// 手动配置type_id为new-release的时候,type_name的值
"name": "最新发布",
// 手动配置type_id为new-release的时候,list_url的值
"url": "{base_url}/{type_id}/?mode=async&function=get_block&block_id=list_videos_common_videos_list&sort_by=release_year&from={page}"
},
// 通配符,表示未定义的类型,其list_url均为设定的值
"*": {
"url": "{base_url}/{type_id}/?mode=async&function=get_block&block_id=list_videos_common_videos_list&sort_by=video_viewed_week&from={page}"
}
},
// 分类节点
"home_cate_node": "//div[contains(@class, 'container')]/div/div[contains(@class, 'myui-panel')]",
// 分类名字
"home_cate_name": ".//div[contains(@class, 'myui-panel__head')]/h3[contains(@class, 'title')]//text()",
// 分类id
"home_cate_id": ".//div[contains(@class, 'myui-panel__head')]//a[contains(@class, 'more')]//@href",
// [可选] 分类id正则提取
"home_cate_id_regexp": "/browse/(\\d+).html",
// 资源节点
"home_vod_node": ".//div[contains(@class,'myui-vodlist__box')]",
// vod_name
"home_vod_name": "./a/@title",
// vod_id
"home_vod_id": "./a/@href",
// [可选] vod_id正则提取
"home_vod_id_regexp": "/vod/(\\d+).html",
// vod_pic
"home_vod_pic": "./a/@data-original",
// vod_remarks
"home_vod_remarks": "./a/span[contains(@class, 'pic-text')]//text()",
// list方法
// 分类页地址 {base_url} base_url {type_id} 分类id {page} 当前页
"list_url": "{base_url}/genre/{type_id}---{page}.html",
// 同上面的homeVod字段 分类列表中的视频信息
"list_vod_node": "//ul[contains(@class, 'myui-vodlist')]/li",
"list_vod_name": ".//a[contains(@class, 'thumb')]/@title",
"list_vod_id": ".//a[contains(@class, 'thumb')]/@href",
"list_vod_id_regexp": "/vod/(\\d+).html",
"list_vod_pic": ".//a[contains(@class, 'thumb')]/@data-original",
"list_vod_remarks": ".//a[contains(@class, 'thumb')]/span[contains(@class, 'pic-text')]//text()",
// 分页信息,获取分页节点
"list_page_node": "//ul[contains(@class, 'myui-page')]",
// 获取最大分页,即pages, app会根据pages来动态替换{page}的内容
"list_page_id": "./a[last()]/@href",
// [可选] page_id正则提取
"list_page_id_regexp": "(?=---(\\d+)\\.html$)",
// 对应detail方法
// 视频详情地址 {base_url} base_url {vod_id} 视频id
"detail_url": "{base_url}/vod/{vod_id}.html",
// VodDetail 参考: https://agcplayer.com/dev/models/vod_detail.html#voddetail
"detail_vod_name": "//div[contains(@class, 'myui-content__detail')]/h1[contains(@class, 'title')]/text()",
"detail_vod_pic": "//div[contains(@class, 'myui-content__thumb')]/a/img/@data-original",
"detail_vod_content": "//div[contains(@id, 'desc')]//div[contains(@class, 'content')]/p/text()",
"detail_vod_actor": "//div[contains(@class, 'myui-content__detail')]/p[contains(@class, 'data')][2]/a/text()",
"detail_vod_year": "//div[contains(@class, 'myui-content__detail')]/p[contains(@class, 'data')][1]/a[2]/text()",
"detail_vod_time": "//div[contains(@class, 'myui-content__detail')]/p[contains(@class, 'data')][5]/text()",
"detail_vod_class": "//span[@class='text-muted' and contains(text(),'分类')]/following-sibling::*[1]/text()",
"detail_vod_area": "",
"detail_vod_lang": "",
"detail_vod_remarks": "//div[contains(@class, 'myui-content__detail')]/p[contains(@class, 'data')][4]/text()",
// 播放源节点
"detail_source_node": "//div[contains(@class, 'row')][2]/div/div[contains(@class, 'myui-panel') and contains(@class, 'myui-panel-bg')]",
// 播放源名字
"detail_source_name": ".//div[contains(@class, 'myui-panel__head')]/h3/text()",
// 播放分集节点
"detail_url_node": ".//ul[contains(@class,'myui-content__list')]/li",
// 播放分集id
"detail_url_id": "./a/@href",
//[可选] 播放分集id正则匹配
"detail_url_id_regexp": "",
// 播放分集名字
"detail_url_name": "./a/text()",
// 相关推荐资源节点, 同上面的homeVod字段 分类列表中的视频信息
"similar_node": "//ul[contains(@id, 'type')]/li",
"similar_vod_name": ".//a[contains(@class, 'myui-vodlist__thumb')]/@title",
"similar_vod_id": ".//a[contains(@class, 'myui-vodlist__thumb')]/@href",
"similar_vod_id_regexp": "/vod/(\\d+).html",
"similar_vod_pic": ".//a[contains(@class, 'myui-vodlist__thumb')]/@style",
"similar_vod_pic_regexp": "url\\(\\s*['\"]?(.*?)['\"]?\\s*\\)",
"similar_vod_remarks": ".//a[contains(@class, 'myui-vodlist__thumb')]/span[contains(@class, 'pic-text')]//text()",
// search 方法
// 搜索地址 {base_url} base_url {wd} 搜索关键字 {page} 当前页
"search_url": "{base_url}/search/{wd}----------{page}---.html",
// 同上面的listVod字段 搜索结果中的视频信息
"search_vod_node": "//ul[contains(@class, 'myui-vodlist')]/li",
"search_vod_name": ".//a[contains(@class, 'thumb')]/@title",
"search_vod_id": ".//a[contains(@class, 'thumb')]/@href",
"search_vod_id_regexp": "/vod/(\\d+).html",
"search_vod_pic": ".//a[contains(@class, 'thumb')]/@data-original",
"search_vod_remarks": ".//a[contains(@class, 'thumb')]/span[contains(@class, 'pic-text')]//text()",
"search_page_node": "//ul[contains(@class, 'myui-page')]",
"search_page_id": "./li[last()]/a/@href",
"search_page_id_regexp": "(?=---(\\d+)\\.html$)",
// play 方法
// [可选] 如需要解析最终播放地址,需要通过ext.js扩展方法
"play_function": "getPlayUrl",
}
ext.js 说明
内置变量
- $fetch:fetch 封装,配置 ua
- DOMParser:解析 html 为 dom
- xpath:xpath 处理模块
- CryptoJS:CryptoJS
使用示例:
const response = await $fetch(`https://gimy.tv/`);
const html = await response.text();
const { document } = DOMParser(html);
const $script = xpath(document, [
"//div[contains(@class, 'myui-player__video')]/script"
])[0];