chore: remove langchain in tools (#3247)
This commit is contained in:
@@ -1,16 +1,187 @@
|
||||
import json
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from typing import Any
|
||||
|
||||
from langchain.tools import PubmedQueryRun
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from core.tools.entities.tool_entities import ToolInvokeMessage
|
||||
from core.tools.tool.builtin_tool import BuiltinTool
|
||||
|
||||
|
||||
class PubMedAPIWrapper(BaseModel):
    """
    Wrapper around PubMed API.

    This wrapper will use the PubMed API to conduct searches and fetch
    document summaries. By default, it will return the document summaries
    of the top-k results of an input search.

    Parameters:
        top_k_results: number of the top-scored documents used for the PubMed tool
        load_max_docs: a limit to the number of loaded documents
        load_all_available_meta:
            if True: the `metadata` of the loaded Documents gets all available meta info
                (see https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch)
            if False: the `metadata` gets only the most informative fields.
    """

    # NCBI E-utilities endpoints: esearch finds UIDs, efetch retrieves records.
    base_url_esearch = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"
    base_url_efetch = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
    # Retry policy for HTTP 429 (Too Many Requests) responses.
    max_retry = 5
    sleep_time = 0.2

    # Default values for the parameters
    top_k_results: int = 3
    load_max_docs: int = 25
    ARXIV_MAX_QUERY_LENGTH = 300
    doc_content_chars_max: int = 2000
    load_all_available_meta: bool = False
    email: str = "your_email@example.com"

    def run(self, query: str) -> str:
        """
        Run PubMed search and get the article meta information.

        See https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
        It uses only the most informative fields of article meta information.
        Returns a human-readable digest, or an error string on any failure
        (callers expect a string, never an exception).
        """
        try:
            # Retrieve the top-k results for the query (query length is capped).
            docs = [
                f"Published: {result['pub_date']}\nTitle: {result['title']}\n"
                f"Summary: {result['summary']}"
                for result in self.load(query[: self.ARXIV_MAX_QUERY_LENGTH])
            ]

            # Join the results and limit the character count
            return (
                "\n\n".join(docs)[: self.doc_content_chars_max]
                if docs
                else "No good PubMed Result was found"
            )
        except Exception as ex:
            return f"PubMed exception: {ex}"

    def load(self, query: str) -> list[dict]:
        """
        Search PubMed for documents matching the query.

        Return a list of dictionaries containing the document metadata
        (uid, title, summary, pub_date).
        """
        url = (
            self.base_url_esearch
            + "db=pubmed&term="
            # BUG FIX: the original used str({urllib.parse.quote(query)}),
            # which built a one-element set and stringified it, sending
            # "{'...'}" to the API instead of the percent-encoded query.
            + urllib.parse.quote(query)
            + f"&retmode=json&retmax={self.top_k_results}&usehistory=y"
        )
        result = urllib.request.urlopen(url)
        text = result.read().decode("utf-8")
        json_text = json.loads(text)

        # usehistory=y makes esearch return a WebEnv token that efetch reuses,
        # so each article fetch targets the same server-side result set.
        webenv = json_text["esearchresult"]["webenv"]
        return [
            self.retrieve_article(uid, webenv)
            for uid in json_text["esearchresult"]["idlist"]
        ]

    def retrieve_article(self, uid: str, webenv: str) -> dict:
        """
        Fetch a single article by uid via efetch and extract its title,
        abstract and publication date from the returned XML.

        Retries with exponential back-off on HTTP 429; any other HTTP
        error is re-raised to the caller.
        """
        url = (
            self.base_url_efetch
            + "db=pubmed&retmode=xml&id="
            + uid
            + "&webenv="
            + webenv
        )

        # BUG FIX: the original doubled self.sleep_time in place, so the
        # back-off grew permanently across unrelated calls (and mutating a
        # pydantic model attribute can raise). Use a local instead.
        sleep_time = self.sleep_time
        retry = 0
        while True:
            try:
                result = urllib.request.urlopen(url)
                break
            except urllib.error.HTTPError as e:
                if e.code == 429 and retry < self.max_retry:
                    # Too Many Requests error:
                    # wait for an exponentially increasing amount of time
                    print(f"Too Many Requests, waiting for {sleep_time:.2f} seconds...")
                    time.sleep(sleep_time)
                    sleep_time *= 2
                    retry += 1
                else:
                    raise e

        xml_text = result.read().decode("utf-8")

        def _extract(tag: str) -> str:
            # Naive first-occurrence tag extraction; returns "" when the
            # tag is absent (same behavior as the original inline copies).
            start_tag = f"<{tag}>"
            end_tag = f"</{tag}>"
            if start_tag in xml_text and end_tag in xml_text:
                return xml_text[
                    xml_text.index(start_tag) + len(start_tag) : xml_text.index(end_tag)
                ]
            return ""

        # Return article as dictionary
        return {
            "uid": uid,
            "title": _extract("ArticleTitle"),
            "summary": _extract("AbstractText"),
            "pub_date": _extract("PubDate"),
        }
||||
|
||||
|
||||
class PubmedQueryRun(BaseModel):
    """Tool that searches the PubMed API."""

    name = "PubMed"
    # BUG FIX: the original description was copy-pasted from the Arxiv tool
    # (Physics, Mathematics, Economics, ...). PubMed indexes biomedical
    # literature, so the description now reflects the actual corpus.
    description = (
        "A wrapper around PubMed.org "
        "Useful for when you need to answer questions about medicine, health, "
        "and biomedical topics "
        "from biomedical literature on PubMed.org. "
        "Input should be a search query."
    )
    # Delegate that performs the actual esearch/efetch calls.
    api_wrapper: PubMedAPIWrapper = Field(default_factory=PubMedAPIWrapper)

    def _run(
        self,
        query: str,
    ) -> str:
        """Use the PubMed tool: return a text digest of the top results."""
        return self.api_wrapper.run(query)
|
||||
|
||||
|
||||
class PubMedInput(BaseModel):
    """Argument schema for the PubMed search tool."""

    query: str = Field(..., description="Search query.")
|
||||
|
||||
|
||||
class PubMedSearchTool(BuiltinTool):
|
||||
"""
|
||||
Tool for performing a search using PubMed search engine.
|
||||
@@ -34,7 +205,7 @@ class PubMedSearchTool(BuiltinTool):
|
||||
|
||||
tool = PubmedQueryRun(args_schema=PubMedInput)
|
||||
|
||||
result = tool.run(query)
|
||||
result = tool._run(query)
|
||||
|
||||
return self.create_text_message(self.summary(user_id=user_id, content=result))
|
||||
|
||||
Reference in New Issue
Block a user