Expand X's t.co links (#91)

This PR expands t.co links in user bios and in the user's link section. It also expands t.co links contained within tweets. ![image](https://github.com/user-attachments/assets/15c72c7b-1950-46a3-872c-dae45119bc81)
2024-10-04 12:42:11 -07:00 · 2024-10-04 12:42:11 -07:00 · 844403906d
commit 844403906d
parent b52f6daa6f
3 changed files with 99 additions and 31 deletions
--- a/toolkits/x/arcade_x/tools/tweets.py
+++ b/toolkits/x/arcade_x/tools/tweets.py
@ -6,7 +6,11 @@ from arcade.core.errors import ToolExecutionError
 from arcade.core.schema import ToolContext
 from arcade.sdk import tool
 from arcade.sdk.auth import X
-from arcade_x.tools.utils import get_tweet_url, parse_search_recent_tweets_response
+from arcade_x.tools.utils import (
+    expand_urls_in_tweets,
+    get_tweet_url,
+    parse_search_recent_tweets_response,
+)

 TWEETS_URL = "https://api.x.com/2/tweets"

@ -69,7 +73,7 @@ async def search_recent_tweets_by_username(
    max_results: Annotated[
        int, "The maximum number of results to return. Cannot be less than 10"
    ] = 10,
-) -> Annotated[str, "JSON string of the search results"]:
+) -> Annotated[dict, "Dictionary containing the search results"]:
    """Search for recent tweets (last 7 days) on X (Twitter) by username. Includes replies and reposts."""

    headers = {
@ -80,9 +84,7 @@ async def search_recent_tweets_by_username(
        "query": f"from:{username}",
        "max_results": max(max_results, 10),  # X API does not allow 'max_results' less than 10
    }
-    url = (
-        "https://api.x.com/2/tweets/search/recent?expansions=author_id&user.fields=id,name,username"
-    )
+    url = "https://api.x.com/2/tweets/search/recent?expansions=author_id&user.fields=id,name,username,entities&tweet.fields=entities"

    async with httpx.AsyncClient() as client:
        response = await client.get(url, headers=headers, params=params, timeout=10)
@ -92,9 +94,14 @@ async def search_recent_tweets_by_username(
            f"Failed to search recent tweets during execution of '{search_recent_tweets_by_username.__name__}' tool. Request returned an error: {response.status_code} {response.text}"
        )

-    tweets_data = parse_search_recent_tweets_response(response)
+    response_data = response.json()

-    return tweets_data
+    # Expand the urls that are in the tweets
+    expand_urls_in_tweets(response_data.get("data", []), delete_entities=True)
+
+    parse_search_recent_tweets_response(response_data)
+
+    return response_data


@tool(requires_auth=X(scopes=["tweet.read", "users.read"]))
@ -109,7 +116,7 @@ async def search_recent_tweets_by_keywords(
    max_results: Annotated[
        int, "The maximum number of results to return. Cannot be less than 10"
    ] = 10,
-) -> Annotated[str, "JSON string of the search results"]:
+) -> Annotated[dict, "Dictionary containing the search results"]:
    """
    Search for recent tweets (last 7 days) on X (Twitter) by required keywords and phrases. Includes replies and reposts
    One of the following input parametersMUST be provided: keywords, phrases
@ -124,7 +131,7 @@ async def search_recent_tweets_by_keywords(
        "Authorization": f"Bearer {context.authorization.token}",
        "Content-Type": "application/json",
    }
-    query = " ".join([f'"{phrase}"' for phrase in (phrases or [])]) + " " + " "
+    query = "".join([f'"{phrase}" ' for phrase in (phrases or [])])
    if keywords:
        query += " ".join(keywords or [])

@ -132,9 +139,7 @@ async def search_recent_tweets_by_keywords(
        "query": query,
        "max_results": max(max_results, 10),  # X API does not allow 'max_results' less than 10
    }
-    url = (
-        "https://api.x.com/2/tweets/search/recent?expansions=author_id&user.fields=id,name,username"
-    )
+    url = "https://api.x.com/2/tweets/search/recent?expansions=author_id&user.fields=id,name,username,entities&tweet.fields=entities"

    async with httpx.AsyncClient() as client:
        response = await client.get(url, headers=headers, params=params, timeout=10)
@ -144,6 +149,11 @@ async def search_recent_tweets_by_keywords(
            f"Failed to search recent tweets during execution of '{search_recent_tweets_by_keywords.__name__}' tool. Request returned an error: {response.status_code} {response.text}"
        )

-    tweets_data = parse_search_recent_tweets_response(response)
+    response_data = response.json()

-    return tweets_data
+    # Expand the urls that are in the tweets
+    expand_urls_in_tweets(response_data.get("data", []), delete_entities=True)
+
+    parse_search_recent_tweets_response(response_data)
+
+    return response_data
--- a/toolkits/x/arcade_x/tools/users.py
+++ b/toolkits/x/arcade_x/tools/users.py
@ -6,6 +6,7 @@ from arcade.core.errors import ToolExecutionError
 from arcade.core.schema import ToolContext
 from arcade.sdk import tool
 from arcade.sdk.auth import X
+from arcade_x.tools.utils import expand_urls_in_user_description, expand_urls_in_user_url


 # Users Lookup Tools. See developer docs for additional available query parameters: https://developer.x.com/en/docs/x-api/users/lookup/api-reference
@ -13,13 +14,13 @@ from arcade.sdk.auth import X
 async def lookup_single_user_by_username(
    context: ToolContext,
    username: Annotated[str, "The username of the X (Twitter) user to look up"],
-) -> Annotated[str, "User information including id, name, username, and description"]:
+) -> Annotated[dict, "User information including id, name, username, and description"]:
    """Look up a user on X (Twitter) by their username."""

    headers = {
        "Authorization": f"Bearer {context.authorization.token}",
    }
-    url = f"https://api.x.com/2/users/by/username/{username}?user.fields=created_at,description,id,location,most_recent_tweet_id,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,verified_type,withheld"
+    url = f"https://api.x.com/2/users/by/username/{username}?user.fields=created_at,description,id,location,most_recent_tweet_id,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,verified_type,withheld,entities"

    async with httpx.AsyncClient() as client:
        response = await client.get(url, headers=headers, timeout=10)
@ -29,8 +30,14 @@ async def lookup_single_user_by_username(
            f"Failed to look up user during execution of '{lookup_single_user_by_username.__name__}' tool. Request returned an error: {response.status_code} {response.text}"
        )

+    # Parse the response JSON
+    user_data = response.json()["data"]
+
+    expand_urls_in_user_description(user_data, delete_entities=False)
+    expand_urls_in_user_url(user_data, delete_entities=True)
+
    """
-    Example response.text structure:
+    Example return value structure:
    {
        "data": {
            "verified_type": str,
@ -55,4 +62,4 @@ async def lookup_single_user_by_username(
        }
    }
    """
-    return response.text
+    return {"data": user_data}
--- a/toolkits/x/arcade_x/tools/utils.py
+++ b/toolkits/x/arcade_x/tools/utils.py
@ -1,6 +1,4 @@
-import json
-
-from requests import Response
+from typing import Any


 def get_tweet_url(tweet_id: str) -> str:
@ -8,7 +6,7 @@ def get_tweet_url(tweet_id: str) -> str:
    return f"https://x.com/x/status/{tweet_id}"


-def parse_search_recent_tweets_response(response: Response) -> str:
+def parse_search_recent_tweets_response(response_data: Any) -> dict:
    """
    Parses response from the X API search recent tweets endpoint.
    Returns a JSON string with the tweets data.
@ -28,22 +26,18 @@ def parse_search_recent_tweets_response(response: Response) -> str:
        },
    ]
    """
-    if response.status_code != 200:
-        return json.dumps({"tweets": []})

-    tweets_data = json.loads(response.text)
+    if not sanity_check_tweets_data(response_data):
+        return {"data": []}

-    if not sanity_check_tweets_data(tweets_data):
-        return json.dumps({"tweets": []})
-
-    for tweet in tweets_data["data"]:
+    for tweet in response_data["data"]:
        tweet["tweet_url"] = get_tweet_url(tweet["id"])

-    for tweet_data, user_data in zip(tweets_data["data"], tweets_data["includes"]["users"]):
+    for tweet_data, user_data in zip(response_data["data"], response_data["includes"]["users"]):
        tweet_data["author_username"] = user_data["username"]
        tweet_data["author_name"] = user_data["name"]

-    return json.dumps({"tweets": tweets_data["data"]})
+    return response_data


 def sanity_check_tweets_data(tweets_data: dict) -> bool:
@ -54,3 +48,60 @@ def sanity_check_tweets_data(tweets_data: dict) -> bool:
    if not tweets_data.get("data", []):
        return False
    return tweets_data.get("includes", {}).get("users", [])
+
+
+def expand_urls_in_tweets(tweets_data: list[dict], delete_entities: bool = True) -> None:
+    """
+    Expands the urls in the text of the provided tweets.
+    X shortens urls, and consequently, this can cause language models to hallucinate.
+    See more about X's link shortener at https://help.x.com/en/using-x/url-shortener
+    """
+    for tweet_data in tweets_data:
+        if "entities" in tweet_data and "urls" in tweet_data["entities"]:
+            for url_entity in tweet_data["entities"]["urls"]:
+                short_url = url_entity["url"]
+                expanded_url = url_entity["expanded_url"]
+                tweet_data["text"] = tweet_data["text"].replace(short_url, expanded_url)
+
+        if delete_entities:
+            tweet_data.pop(
+                "entities", None
+            )  # Now that we've expanded the urls in the tweet, we no longer need the entities
+
+
+def expand_urls_in_user_description(user_data: dict, delete_entities: bool = True) -> None:
+    """
+    Expands the urls in the description of the provided user.
+    X shortens urls, and consequently, this can cause language models to hallucinate.
+    See more about X's link shortener at https://help.x.com/en/using-x/url-shortener
+    """
+    description_urls = user_data.get("entities", {}).get("description", {}).get("urls", [])
+    description = user_data.get("description", "")
+    for url_info in description_urls:
+        t_co_link = url_info["url"]
+        expanded_url = url_info["expanded_url"]
+        description = description.replace(t_co_link, expanded_url)
+    user_data["description"] = description
+
+    if delete_entities:
+        # Entities is no longer needed now that we have expanded the t.co links
+        user_data.pop("entities", None)
+
+
+def expand_urls_in_user_url(user_data: dict, delete_entities: bool = True) -> None:
+    """
+    Expands the urls in the url section of the provided user.
+    X shortens urls, and consequently, this can cause language models to hallucinate.
+    See more about X's link shortener at https://help.x.com/en/using-x/url-shortener
+    """
+    url_urls = user_data.get("entities", {}).get("url", {}).get("urls", [])
+    url = user_data.get("url", "")
+    for url_info in url_urls:
+        t_co_link = url_info["url"]
+        expanded_url = url_info["expanded_url"]
+        url = url.replace(t_co_link, expanded_url)
+    user_data["url"] = url
+
+    if delete_entities:
+        # Entities is no longer needed now that we have expanded the t.co links
+        user_data.pop("entities", None)