import hashlib
import json
from typing import Dict

from tavily import TavilyClient

from config.settings import REGULATORY_SOURCES, SOURCE_FULL_NAMES, TAVILY_API_KEY
from tools.llm import call_llm

# Initialize Tavily client
tavily_client = TavilyClient(api_key=TAVILY_API_KEY)


class WebTools:
    def __init__(self):
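        # Simple in-memory cache: md5(industry:region:keywords) -> results dict.
        # Entries live for the lifetime of this instance; there is no eviction.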
        self.cached_searches = {}

    def generate_cache_key(self, industry: str, region: str, keywords: str) -> str:
        """
        Generate a unique cache key based on industry, region, and keywords.
        """
        key = f"{industry}:{region}:{keywords}".lower()
        return hashlib.md5(key.encode()).hexdigest()

    def crawl_regulatory_sites(self, industry: str, region: str, keywords: str) -> Dict:
        """
        Crawl regulatory websites for updates.
        """
        cache_key = self.generate_cache_key(industry, region, keywords)
        if cache_key in self.cached_searches:
            return self.cached_searches[cache_key]

        urls_to_crawl = REGULATORY_SOURCES.get(region, REGULATORY_SOURCES["US"])
        all_results = []
        crawl_instructions = (
            f"Recent {industry} {region} regulatory updates: {keywords}, 30 days"
        )

        # Crawl regulatory sites (limit to 3 sources)
        for source_name, url in list(urls_to_crawl.items())[:3]:
            crawl_results = self._get_crawl_results(
                source_name, url, crawl_instructions
            )
            all_results.extend(crawl_results)

        # General search
        search_results = self._get_search_results(industry, region, keywords)
        all_results.extend(search_results)

        results = {"results": all_results, "total_found": len(all_results)}
        self.cached_searches[cache_key] = results
        return results

    def _get_crawl_results(self, source_name: str, url: str, instructions: str) -> list:
        """
        Crawl a single regulatory source and return formatted results.
        """
        results = []
        try:
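            # Shallow, bounded crawl: follow links up to two levels deep and
            # keep at most five pages per source to limit API usage.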
            crawl_response = tavily_client.crawl(
                url=url, max_depth=2, limit=5, instructions=instructions
            )
            for result in crawl_response.get("results", []):
                title = result.get("title")
                if not title or title == "No Title...":
                    title = SOURCE_FULL_NAMES.get(source_name, source_name)
                results.append(
                    {
                        "source": source_name,
                        "url": result.get("url", url),
                        "title": title,
                        "content": result.get("raw_content", "")[:1500],
                    }
                )
        except Exception as e:
            print(f"Crawl error for {source_name}: {e}")
        return results

    def _get_search_results(self, industry: str, region: str, keywords: str) -> list:
        """
        Perform a general web search and return formatted results.
        """
        results = []
        try:
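            # The year terms bias results toward recent coverage;
            # include_raw_content asks Tavily to return full page text in
            # addition to its summary snippet.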
            search_results = tavily_client.search(
                query=f"{industry} {region} regulatory updates compliance {keywords} 2024 2025",
                max_results=5,
                include_raw_content=True,
            )
            for result in search_results.get("results", []):
                results.append(
                    {
                        "source": "Web Search",
                        "url": result.get("url", ""),
                        "title": result.get("title", ""),
                        "content": result.get("content", ""),
                    }
                )
        except Exception as e:
            print(f"Search error: {e}")
        return results

    def extract_parameters(self, message: str) -> Dict:
        """
        Extract industry, region, and keywords from the query using LLM (no function calling).
        """
        # str.format substitutes {message} and collapses the doubled braces in
        # the example output to literal JSON braces.
        prompt = (
            """
            Extract the following information from the user query below and return ONLY a valid JSON object with keys: industry, region, keywords.
            - industry: The industry mentioned or implied (e.g., fintech, healthcare, energy, general).
            - region: The region or country explicitly mentioned (e.g., US, EU, UK, Asia, Global).
            - keywords: The most important regulatory topics or terms, separated by commas. Do NOT include generic words or verbs.

            User query: {message}

            Example output:
            {{"industry": "fintech", "region": "US", "keywords": "SEC regulations"}}
            """
        ).format(message=message)

        response = call_llm(prompt)
        try:
            params = json.loads(response)
        except (json.JSONDecodeError, TypeError):
            # Fall back to empty fields when the LLM reply is not valid JSON.
            params = {"industry": "", "region": "", "keywords": ""}
        return params
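

# Minimal usage sketch, assuming config.settings provides a valid
# TAVILY_API_KEY plus REGULATORY_SOURCES/SOURCE_FULL_NAMES, and that
# tools.llm.call_llm is wired to a working model. The query below is
# illustrative only.
if __name__ == "__main__":
    tools = WebTools()
    params = tools.extract_parameters(
        "Any new SEC regulations affecting US fintech companies?"
    )
    updates = tools.crawl_regulatory_sites(
        industry=params.get("industry") or "general",
        region=params.get("region") or "US",
        keywords=params.get("keywords") or "compliance",
    )
    print(f"Found {updates['total_found']} results")
    for item in updates["results"][:3]:
        print(f"- [{item['source']}] {item['title']} ({item['url']})")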