# RegRadar/tools/web_tools.py
import hashlib
import json
from typing import Dict

from tavily import TavilyClient

from config.settings import REGULATORY_SOURCES, SOURCE_FULL_NAMES, TAVILY_API_KEY
from tools.llm import call_llm

# Initialize a single shared Tavily client for all WebTools instances
tavily_client = TavilyClient(api_key=TAVILY_API_KEY)

class WebTools:
    """Crawls regulatory sources and searches the web for compliance updates."""

    def __init__(self):
        # In-memory, per-instance cache keyed by an MD5 digest of the query
        self.cached_searches = {}

    def generate_cache_key(self, industry: str, region: str, keywords: str) -> str:
        """Generate a deterministic cache key from industry, region, and keywords."""
        key = f"{industry}:{region}:{keywords}".lower()
        return hashlib.md5(key.encode()).hexdigest()
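
    # Usage sketch: identical (industry, region, keywords) triples map to the
    # same 32-character hex key, so repeated queries hit the cache below, e.g.
    #   WebTools().generate_cache_key("fintech", "US", "SEC rules")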

    def crawl_regulatory_sites(self, industry: str, region: str, keywords: str) -> Dict:
        """Crawl regulatory websites for recent updates, with per-instance caching."""
        cache_key = self.generate_cache_key(industry, region, keywords)
        if cache_key in self.cached_searches:
            return self.cached_searches[cache_key]

        # Fall back to US sources when the region is not configured
        urls_to_crawl = REGULATORY_SOURCES.get(region, REGULATORY_SOURCES["US"])
        all_results = []
        crawl_instructions = (
            f"Recent {industry} {region} regulatory updates: {keywords} (last 30 days)"
        )

        # Crawl regulatory sites (limit to the first 3 configured sources)
        for source_name, url in list(urls_to_crawl.items())[:3]:
            crawl_results = self._get_crawl_results(
                source_name, url, crawl_instructions
            )
            all_results.extend(crawl_results)

        # Supplement the crawl with a general web search
        search_results = self._get_search_results(industry, region, keywords)
        all_results.extend(search_results)

        results = {"results": all_results, "total_found": len(all_results)}
        self.cached_searches[cache_key] = results
        return results
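
    # Note: the returned dict always has the shape
    #   {"results": [{"source", "url", "title", "content"}, ...], "total_found": int}
    # so crawl hits and search hits can be rendered uniformly downstream.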

    def _get_crawl_results(self, source_name: str, url: str, instructions: str) -> list:
        """Crawl a single regulatory source and return formatted results."""
        results = []
        try:
            crawl_response = tavily_client.crawl(
                url=url, max_depth=2, limit=5, instructions=instructions
            )
            for result in crawl_response.get("results", []):
                # Fall back to the source's full name when the page title is missing
                title = result.get("title")
                if not title or title == "No Title...":
                    title = SOURCE_FULL_NAMES.get(source_name, source_name)
                results.append(
                    {
                        "source": source_name,
                        "url": result.get("url", url),
                        "title": title,
                        # Guard against a missing/None raw_content, and truncate
                        # to keep downstream prompts small
                        "content": (result.get("raw_content") or "")[:1500],
                    }
                )
        except Exception as e:
            print(f"Crawl error for {source_name}: {e}")
        return results
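
    # Design note: crawl failures are logged and swallowed above so that a
    # single unreachable source does not abort the whole update run.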

    def _get_search_results(self, industry: str, region: str, keywords: str) -> list:
        """Perform a general web search and return formatted results."""
        results = []
        try:
            search_results = tavily_client.search(
                query=f"{industry} {region} regulatory updates compliance {keywords} 2024 2025",
                max_results=5,
                include_raw_content=True,
            )
            for result in search_results.get("results", []):
                results.append(
                    {
                        "source": "Web Search",
                        "url": result.get("url", ""),
                        "title": result.get("title", ""),
                        "content": result.get("content", ""),
                    }
                )
        except Exception as e:
            print(f"Search error: {e}")
        return results

    def extract_parameters(self, message: str) -> Dict:
        """Extract industry, region, and keywords from the query using the LLM
        (plain prompting, no function calling)."""
        prompt = (
            "Extract the following information from the user query below and "
            "return ONLY a valid JSON object with keys: industry, region, keywords.\n"
            "- industry: The industry mentioned or implied (e.g., fintech, healthcare, energy, general).\n"
            "- region: The region or country explicitly mentioned (e.g., US, EU, UK, Asia, Global).\n"
            "- keywords: The most important regulatory topics or terms, separated by commas. "
            "Do NOT include generic words or verbs.\n"
            f"User query: {message}\n"
            "Example output:\n"
            '{"industry": "fintech", "region": "US", "keywords": "SEC regulations"}'
        )
        response = call_llm(prompt)
        try:
            params = json.loads(response)
        except (json.JSONDecodeError, TypeError):
            # Fall back to empty parameters if the model returns malformed JSON
            params = {"industry": "", "region": "", "keywords": ""}
        return params
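

# Minimal usage sketch (an assumption for illustration, not part of the module):
# exercises parameter extraction and the crawl flow end to end. It requires a
# valid TAVILY_API_KEY and the REGULATORY_SOURCES / SOURCE_FULL_NAMES mappings
# from config.settings.
if __name__ == "__main__":
    tools = WebTools()
    params = tools.extract_parameters("Any new SEC rules for US fintech lending?")
    updates = tools.crawl_regulatory_sites(
        params.get("industry", "fintech"),
        params.get("region", "US"),
        params.get("keywords", "SEC regulations"),
    )
    print(f"Found {updates['total_found']} results")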