import hashlib
import json
from typing import Dict

from tavily import TavilyClient

from config.settings import REGULATORY_SOURCES, SOURCE_FULL_NAMES, TAVILY_API_KEY
from tools.llm import call_llm

# Initialize Tavily client
tavily_client = TavilyClient(api_key=TAVILY_API_KEY)


class WebTools:
    def __init__(self):
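        # Simple in-memory cache: md5(industry:region:keywords) -> results dict.
        # Entries live for the lifetime of this instance; there is no eviction.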
        self.cached_searches = {}

    def generate_cache_key(self, industry: str, region: str, keywords: str) -> str:
        """
        Generate a unique cache key based on industry, region, and keywords.
        """
        key = f"{industry}:{region}:{keywords}".lower()
        return hashlib.md5(key.encode()).hexdigest()

    def crawl_regulatory_sites(self, industry: str, region: str, keywords: str) -> Dict:
        """
        Crawl regulatory websites for updates.
        """
        cache_key = self.generate_cache_key(industry, region, keywords)
        if cache_key in self.cached_searches:
            return self.cached_searches[cache_key]

        urls_to_crawl = REGULATORY_SOURCES.get(region, REGULATORY_SOURCES["US"])
        all_results = []
        crawl_instructions = (
            f"Recent {industry} {region} regulatory updates: {keywords}, 30 days"
        )

        # Crawl regulatory sites (limit to 3 sources)
        for source_name, url in list(urls_to_crawl.items())[:3]:
            crawl_results = self._get_crawl_results(
                source_name, url, crawl_instructions
            )
            all_results.extend(crawl_results)

        # General search
        search_results = self._get_search_results(industry, region, keywords)
        all_results.extend(search_results)

        results = {"results": all_results, "total_found": len(all_results)}
        self.cached_searches[cache_key] = results
        return results

    def _get_crawl_results(self, source_name: str, url: str, instructions: str) -> list:
        """
        Crawl a single regulatory source and return formatted results.
        """
        results = []
        try:
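            # Shallow, bounded crawl: follow links up to two levels deep and
            # keep at most five pages per source to limit API usage.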
            crawl_response = tavily_client.crawl(
                url=url, max_depth=2, limit=5, instructions=instructions
            )
            for result in crawl_response.get("results", []):
                title = result.get("title")
                if not title or title == "No Title...":
                    title = SOURCE_FULL_NAMES.get(source_name, source_name)
                results.append(
                    {
                        "source": source_name,
                        "url": result.get("url", url),
                        "title": title,
                        "content": result.get("raw_content", "")[:1500],
                    }
                )
        except Exception as e:
            print(f"Crawl error for {source_name}: {e}")
        return results

    def _get_search_results(self, industry: str, region: str, keywords: str) -> list:
        """
        Perform a general web search and return formatted results.
        """
        results = []
        try:
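            # The year terms bias results toward recent coverage;
            # include_raw_content asks Tavily to return full page text in
            # addition to its summary snippet.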
            search_results = tavily_client.search(
                query=f"{industry} {region} regulatory updates compliance {keywords} 2024 2025",
                max_results=5,
                include_raw_content=True,
            )
            for result in search_results.get("results", []):
                results.append(
                    {
                        "source": "Web Search",
                        "url": result.get("url", ""),
                        "title": result.get("title", ""),
                        "content": result.get("content", ""),
                    }
                )
        except Exception as e:
            print(f"Search error: {e}")
        return results

    def extract_parameters(self, message: str) -> Dict:
        """
        Extract industry, region, and keywords from the query using LLM (no function calling).
        """
        # str.format substitutes {message} and collapses the doubled braces in
        # the example output to literal JSON braces.
        prompt = (
            """
            Extract the following information from the user query below and return ONLY a valid JSON object with keys: industry, region, keywords.
            - industry: The industry mentioned or implied (e.g., fintech, healthcare, energy, general).
            - region: The region or country explicitly mentioned (e.g., US, EU, UK, Asia, Global).
            - keywords: The most important regulatory topics or terms, separated by commas. Do NOT include generic words or verbs.

            User query: {message}

            Example output:
            {{"industry": "fintech", "region": "US", "keywords": "SEC regulations"}}
            """
        ).format(message=message)

        response = call_llm(prompt)
        try:
            params = json.loads(response)
        except (json.JSONDecodeError, TypeError):
            # Fall back to empty fields when the LLM reply is not valid JSON.
            params = {"industry": "", "region": "", "keywords": ""}
        return params
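

# Minimal usage sketch, assuming config.settings provides a valid
# TAVILY_API_KEY plus REGULATORY_SOURCES/SOURCE_FULL_NAMES, and that
# tools.llm.call_llm is wired to a working model. The query below is
# illustrative only.
if __name__ == "__main__":
    tools = WebTools()
    params = tools.extract_parameters(
        "Any new SEC regulations affecting US fintech companies?"
    )
    updates = tools.crawl_regulatory_sites(
        industry=params.get("industry") or "general",
        region=params.get("region") or "US",
        keywords=params.get("keywords") or "compliance",
    )
    print(f"Found {updates['total_found']} results")
    for item in updates["results"][:3]:
        print(f"- [{item['source']}] {item['title']} ({item['url']})")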