# NOTE: this line replaces non-Python residue from a web scrape (Hugging Face
# Spaces page header: status lines, file size, commit-hash and line-number
# gutters) that made the file unparseable.
import hashlib
from typing import Dict
from tavily import TavilyClient
from config.settings import REGULATORY_SOURCES, SOURCE_FULL_NAMES, TAVILY_API_KEY
from tools.llm import call_llm
# Initialize Tavily client
tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
class WebTools:
    """Crawl regulatory websites and run web searches (via Tavily) for
    regulatory updates, with simple in-memory result caching."""

    def __init__(self):
        # In-memory cache: cache_key -> results dict. Unbounded; entries live
        # for the lifetime of this instance.
        self.cached_searches = {}

    def generate_cache_key(self, industry: str, region: str, keywords: str) -> str:
        """
        Generate a deterministic, case-insensitive cache key for a search.

        Returns a 32-char hex digest. md5 is acceptable here: the key is a
        cache identifier, not a security token.
        """
        key = f"{industry}:{region}:{keywords}".lower()
        return hashlib.md5(key.encode()).hexdigest()

    def crawl_regulatory_sites(self, industry: str, region: str, keywords: str) -> Dict:
        """
        Crawl regulatory websites and run a general search for updates.

        Returns a dict with "results" (list of result dicts) and
        "total_found" (int). Results are cached per (industry, region,
        keywords), case-insensitively.
        """
        cache_key = self.generate_cache_key(industry, region, keywords)
        if cache_key in self.cached_searches:
            return self.cached_searches[cache_key]

        # Fall back to US sources when the region is not configured.
        urls_to_crawl = REGULATORY_SOURCES.get(region, REGULATORY_SOURCES["US"])
        all_results = []
        crawl_instructions = (
            f"Recent {industry} {region} regulatory updates: {keywords}, 30 days"
        )
        # Crawl regulatory sites (limit to 3 sources to bound latency/cost).
        for source_name, url in list(urls_to_crawl.items())[:3]:
            all_results.extend(
                self._get_crawl_results(source_name, url, crawl_instructions)
            )
        # A general web search supplements the targeted crawls.
        all_results.extend(self._get_search_results(industry, region, keywords))

        results = {"results": all_results, "total_found": len(all_results)}
        self.cached_searches[cache_key] = results
        return results

    def _get_crawl_results(self, source_name: str, url: str, instructions: str) -> list:
        """
        Crawl a single regulatory source and return formatted results.

        Best-effort: any error is logged and an empty list is returned.
        """
        results = []
        try:
            crawl_response = tavily_client.crawl(
                url=url, max_depth=2, limit=5, instructions=instructions
            )
            for result in crawl_response.get("results", []):
                title = result.get("title")
                if not title or title == "No Title...":
                    # Fall back to the configured human-readable source name.
                    title = SOURCE_FULL_NAMES.get(source_name, source_name)
                results.append(
                    {
                        "source": source_name,
                        "url": result.get("url", url),
                        "title": title,
                        # BUGFIX: "raw_content" may be present but None, in
                        # which case .get's default does not apply and slicing
                        # None raised TypeError. Coalesce before truncating.
                        "content": (result.get("raw_content") or "")[:1500],
                    }
                )
        except Exception as e:
            print(f"Crawl error for {source_name}: {e}")
        return results

    def _get_search_results(self, industry: str, region: str, keywords: str) -> list:
        """
        Perform a general web search and return formatted results.

        Best-effort: any error is logged and an empty list is returned.
        """
        results = []
        try:
            search_results = tavily_client.search(
                query=f"{industry} {region} regulatory updates compliance {keywords} 2024 2025",
                max_results=5,
                include_raw_content=True,
            )
            for result in search_results.get("results", []):
                results.append(
                    {
                        "source": "Web Search",
                        "url": result.get("url", ""),
                        "title": result.get("title", ""),
                        # Coalesce: keys may be present with a None value.
                        "content": result.get("content") or "",
                    }
                )
        except Exception as e:
            print(f"Search error: {e}")
        return results

    def extract_parameters(self, message: str) -> Dict:
        """
        Extract industry, region, and keywords from the query using LLM (no function calling).

        Returns a dict with keys "industry", "region", "keywords"; all values
        are empty strings when the LLM response cannot be parsed as JSON.
        """
        import json
        import re

        # BUGFIX: the original used .replace("{message}", message), which left
        # the {{ }} escapes in the example output literally doubled in the
        # prompt sent to the LLM. str.format both substitutes {message} and
        # collapses {{ }} to single braces.
        prompt = (
            """
Extract the following information from the user query below and return ONLY a valid JSON object with keys: industry, region, keywords.
- industry: The industry mentioned or implied (e.g., fintech, healthcare, energy, general).
- region: The region or country explicitly mentioned (e.g., US, EU, UK, Asia, Global).
- keywords: The most important regulatory topics or terms, separated by commas. Do NOT include generic words or verbs.
User query: {message}
Example output:
{{"industry": "fintech", "region": "US", "keywords": "SEC regulations"}}
"""
        ).format(message=message)
        response = call_llm(prompt)
        # LLMs often wrap JSON in markdown fences or surrounding prose; grab
        # the first {...} span before parsing.
        match = re.search(r"\{.*\}", response, re.DOTALL)
        try:
            params = json.loads(match.group(0) if match else response)
            if not isinstance(params, dict):
                raise ValueError("LLM response is not a JSON object")
        except Exception:
            params = {"industry": "", "region": "", "keywords": ""}
        return params
|