searxng/searx/engines/acfun.py

# SPDX-License-Identifier: AGPL-3.0-or-later
"""Acfun search engine for searxng"""

from urllib.parse import urlencode
import re
import json
from datetime import datetime, timedelta
from lxml import html

from searx.utils import extract_text

# Metadata
about = {
    "website": "https://www.acfun.cn/",
    "wikidata_id": "Q3077675",
    "use_official_api": False,
    "require_api_key": False,
    "results": "HTML",
    "language": "zh",
}

# Engine Configuration
categories = ["videos"]
paging = True

# Base URL
base_url = "https://www.acfun.cn"


def request(query, params):
    query_params = {"keyword": query, "pCursor": params["pageno"]}
    params["url"] = f"{base_url}/search?{urlencode(query_params)}"
    return params


def response(resp):
    results = []

    matches = re.findall(r'bigPipe\.onPageletArrive\((\{.*?\})\);', resp.text, re.DOTALL)
    if not matches:
        return results

    for match in matches:
        try:
            json_data = json.loads(match)
            raw_html = json_data.get("html", "")
            if not raw_html:
                continue

            tree = html.fromstring(raw_html)

            video_blocks = tree.xpath('//div[contains(@class, "search-video")]')
            if not video_blocks:
                continue

            for video_block in video_blocks:
                video_info = extract_video_data(video_block)
                if video_info and video_info["title"] and video_info["url"]:
                    results.append(video_info)

        except json.JSONDecodeError:
            continue

    return results


def extract_video_data(video_block):
    try:
        data_exposure_log = video_block.get('data-exposure-log')
        video_data = json.loads(data_exposure_log)

        content_id = video_data.get("content_id", "")
        title = video_data.get("title", "")

        url = f"{base_url}/v/ac{content_id}"
        iframe_src = f"{base_url}/player/ac{content_id}"

        create_time = extract_text(video_block.xpath('.//span[contains(@class, "info__create-time")]'))
        video_cover = extract_text(video_block.xpath('.//div[contains(@class, "video__cover")]/a/img/@src')[0])
        video_duration = extract_text(video_block.xpath('.//span[contains(@class, "video__duration")]'))
        video_intro = extract_text(video_block.xpath('.//div[contains(@class, "video__main__intro")]'))

        published_date = None
        if create_time:
            try:
                published_date = datetime.strptime(create_time.strip(), "%Y-%m-%d")
            except (ValueError, TypeError):
                pass

        length = None
        if video_duration:
            try:
                timediff = datetime.strptime(video_duration.strip(), "%M:%S")
                length = timedelta(minutes=timediff.minute, seconds=timediff.second)
            except (ValueError, TypeError):
                pass

        return {
            "title": title,
            "url": url,
            "content": video_intro,
            "thumbnail": video_cover,
            "length": length,
            "publishedDate": published_date,
            "iframe_src": iframe_src,
        }

    except (json.JSONDecodeError, AttributeError, TypeError, ValueError):
        return None