WIP: podcast feature

2024-10-24 13:25:01 -03:00 · 2024-10-24 13:25:01 -03:00 · 177d2c2b93
commit 177d2c2b93
parent 10ec265a44
4 changed files with 686 additions and 28 deletions
--- a/open_notebook/graphs/website_extractor.py
+++ b/open_notebook/graphs/website_extractor.py
@ -0,0 +1,179 @@
+"""
+Website Extractor Module
+
+This module is responsible for extracting clean text content from websites using
+BeautifulSoup for local HTML parsing instead of the Jina AI API.
+"""
+
+import html
+import logging
+import re
+from typing import List
+from urllib.parse import urlparse
+
+import requests
+from bs4 import BeautifulSoup
+from podcastfy.utils.config import load_config
+
+logger = logging.getLogger(__name__)
+
+
+class WebsiteExtractor:
+    """
+    Fetch a web page and reduce it to plain text using BeautifulSoup.
+
+    Settings come from the ``website_extractor`` section of the mapping returned
+    by ``load_config()``: the tags to strip, the User-Agent header, the request
+    timeout and optional regex patterns to delete from the extracted text.
+    """
+
+    def __init__(self):
+        """
+        Initialize the WebsiteExtractor.
+        """
+        self.config = load_config()
+        # Every setting below falls back to a default when the key is missing.
+        self.website_extractor_config = self.config.get("website_extractor", {})
+        self.unwanted_tags = self.website_extractor_config.get("unwanted_tags", [])
+        self.user_agent = self.website_extractor_config.get("user_agent", "Mozilla/5.0")
+        self.timeout = self.website_extractor_config.get("timeout", 10)
+        self.remove_patterns = self.website_extractor_config.get(
+            "markdown_cleaning", {}
+        ).get("remove_patterns", [])
+
+    def extract_content(self, url: str) -> str:
+        """
+        Extract clean text content from a website using BeautifulSoup.
+
+        Args:
+                url (str): Website URL.
+
+        Returns:
+                str: Extracted clean text content.
+
+        Raises:
+                Exception: If there's an error in extracting the content. Request
+                        failures and any other error (including an invalid URL) are
+                        re-raised as a plain Exception chained to the original one.
+        """
+        try:
+            # Normalize the URL
+            normalized_url = self.normalize_url(url)
+
+            # Request the webpage
+            headers = {"User-Agent": self.user_agent}
+            response = requests.get(
+                normalized_url, headers=headers, timeout=self.timeout
+            )
+            response.raise_for_status()  # Raise an exception for bad status codes
+
+            # Parse the page content with BeautifulSoup
+            soup = BeautifulSoup(response.text, "html.parser")
+
+            # Remove unwanted elements
+            self.remove_unwanted_elements(soup)
+
+            # Extract and clean the text content
+            raw_text = soup.get_text(separator="\n")  # Get all text content
+            return self.clean_content(raw_text)
+        except requests.RequestException as e:
+            message = f"Failed to extract content from {url}: {str(e)}"
+            logger.error(message)
+            # Chain explicitly so the original error is recorded as the cause.
+            raise Exception(message) from e
+        except Exception as e:
+            message = f"An unexpected error occurred while extracting content from {url}: {str(e)}"
+            logger.error(message)
+            raise Exception(message) from e
+
+    def normalize_url(self, url: str) -> str:
+        """
+        Normalize the given URL by adding scheme if missing and ensuring it's a valid URL.
+
+        Args:
+                url (str): The URL to normalize.
+
+        Returns:
+                str: The normalized URL.
+
+        Raises:
+                ValueError: If the URL is invalid after normalization attempts.
+        """
+        # Surrounding whitespace (e.g. from pasted input) would otherwise be
+        # glued into the URL when the scheme is prepended.
+        url = url.strip()
+
+        # If the URL doesn't start with a scheme, add 'https://'. Compare
+        # case-insensitively so "HTTP://example.com" is not prefixed again.
+        if not url.lower().startswith(("http://", "https://")):
+            url = "https://" + url
+
+        # Parse the URL
+        parsed = urlparse(url)
+
+        # Ensure the URL has a valid scheme and netloc
+        if not all([parsed.scheme, parsed.netloc]):
+            raise ValueError(f"Invalid URL: {url}")
+
+        return parsed.geturl()
+
+    def remove_unwanted_elements(self, soup: BeautifulSoup) -> None:
+        """
+        Remove unwanted elements from the BeautifulSoup object.
+
+        The soup is modified in place: every element whose tag name is listed in
+        ``self.unwanted_tags`` is decomposed.
+
+        Args:
+                soup (BeautifulSoup): The BeautifulSoup object to clean.
+        """
+        for tag in self.unwanted_tags:
+            for element in soup.find_all(tag):
+                element.decompose()
+
+    def clean_content(self, content: str) -> str:
+        """
+        Clean the extracted content by removing unnecessary whitespace and applying
+        custom cleaning patterns.
+
+        Args:
+                content (str): The content to clean.
+
+        Returns:
+                str: Cleaned text content.
+        """
+        # Decode HTML entities
+        cleaned_content = html.unescape(content)
+
+        # Collapse every whitespace run (spaces, tabs and newlines) to one space
+        cleaned_content = re.sub(r"\s+", " ", cleaned_content)
+
+        # Remove extra newlines
+        # NOTE(review): the substitution above already replaced every newline,
+        # so this pattern cannot match; it is kept so the output stays the same.
+        cleaned_content = re.sub(r"\n{3,}", "\n\n", cleaned_content)
+
+        # Apply custom cleaning patterns from config
+        for pattern in self.remove_patterns:
+            cleaned_content = re.sub(pattern, "", cleaned_content)
+
+        return cleaned_content.strip()
+
+
+def main(seed: int = 42) -> None:
+    """
+    Smoke-test the WebsiteExtractor class against a few sample URLs.
+
+    Sets up INFO-level logging, extracts each URL and logs a 500-character
+    preview plus the total length. A failure on one URL is logged and the loop
+    moves on to the next.
+
+    Args:
+            seed (int): Not used by this function.
+    """
+    logging.basicConfig(level=logging.INFO)
+
+    site_extractor = WebsiteExtractor()
+    separator = "-" * 50
+
+    # One URL without a scheme and one with, to exercise normalization
+    sample_urls: List[str] = [
+        "www.souzatharsis.com",
+        "https://en.wikipedia.org/wiki/Web_scraping",
+    ]
+
+    for target in sample_urls:
+        try:
+            logger.info(f"Extracting content from: {target}")
+            text = site_extractor.extract_content(target)
+
+            # Log only the beginning of the text, then its full size
+            preview = text[:500]
+            logger.info(f"Extracted content (first 500 characters):\n{preview}...")
+            logger.info(f"Total length of extracted content: {len(text)} characters")
+            logger.info(separator)
+
+        except Exception as exc:
+            logger.error(f"An error occurred while processing {target}: {str(exc)}")
+
+
+if __name__ == "__main__":
+    main()
--- a/open_notebook/plugins/podcasts.py
+++ b/open_notebook/plugins/podcasts.py
@ -0,0 +1,307 @@
+from typing import ClassVar, List, Literal
+
+from pydantic import Field, field_validator
+
+from open_notebook.domain import ObjectModel
+
+
+class PodcastEpisode(ObjectModel):
+    """Record describing one podcast episode, stored in the ``podcast_episode`` table."""
+
+    table_name: ClassVar[str] = "podcast_episode"
+    # Episode name
+    name: str
+    # Template used for the episode (presumably a PodcastConfig name; verify against caller)
+    template: str
+    # Free-text instructions for the episode
+    instructions: str
+    # Path of the episode file (presumably the generated audio; confirm)
+    file_path: str
+
+
+class PodcastConfig(ObjectModel):
+    """
+    Reusable podcast template, stored in the ``podcast_config`` table.
+
+    Holds the show identity, the two speaker roles, the style / engagement /
+    structure tags, the target word count, a creativity value between 0 and 1,
+    and the speech provider with its two voices and model.
+    """
+
+    table_name: ClassVar[str] = "podcast_config"
+    name: str
+    podcast_name: str
+    podcast_tagline: str
+    output_language: str = Field(default="English")
+    person1_role: str
+    person2_role: str
+    conversation_style: List[str]
+    engagement_technique: List[str]
+    dialogue_structure: List[str]
+    # NOTE(review): these bounds (exclusive 500..10000) differ from
+    # validate_wordcount below (inclusive 500..6000); a value has to satisfy
+    # both, so confirm which range is intended and align them.
+    wordcount: int = Field(gt=500, lt=10000)
+    creativity: float = Field(ge=0, le=1)
+    provider: Literal["openai", "elevenlabs", "edge"] = Field(default="openai")
+    voice1: str
+    voice2: str
+    model: str
+
+    @field_validator("wordcount")
+    @classmethod
+    def validate_wordcount(cls, value):
+        """Reject word counts outside 500..6000 (inclusive)."""
+        if not 500 <= value <= 6000:
+            # The message states the limit that is actually enforced here.
+            raise ValueError("Wordcount must be between 500 and 6000")
+        return value
+
+    @field_validator("creativity")
+    @classmethod
+    def validate_creativity(cls, value):
+        """Reject creativity values outside 0..1 (inclusive)."""
+        if not 0 <= value <= 1:
+            raise ValueError("Creativity must be between 0 and 1")
+        return value
+
+
+# Conversation Styles
+# Suggested values for a template's conversation_style tags.
+conversation_styles = [
+    "Analytical",
+    "Argumentative",
+    "Informative",
+    "Humorous",
+    "Casual",
+    "Formal",
+    "Inspirational",
+    "Debate-style",
+    "Interview-style",
+    "Storytelling",
+    "Reflective",
+    "Narrative",
+    "Satirical",
+    "Educational",
+    "Conversational",
+    "Critical",
+    "Empathetic",
+    "Philosophical",
+    "Speculative",
+    "Motivational",
+    "Fun",
+    "Technical",
+    "Light-hearted",
+    "Serious",
+    "Investigative",
+    "Debunking",
+    "Collaborative",
+    "Didactic",
+    "Thought-provoking",
+    "Controversial",
+    "Skeptical",
+    "Optimistic",
+    "Pessimistic",
+    "Objective",
+    "Subjective",
+    "Sarcastic",
+    "Emotional",
+    "Exploratory",
+    "Friendly",
+    "Fast-paced",
+    "Slow-paced",
+    "Introspective",
+    "Open-ended",
+    "Affirmative",
+    "Dissenting",
+]
+
+# Dialogue Structures: suggested segment names for a template's dialogue_structure tags
+dialogue_structures = [
+    "Topic Introduction",
+    "Opening Monologue",
+    "Guest Introduction",
+    "Icebreakers",
+    "Historical Context",
+    "Defining Terms",
+    "Problem Statement",
+    "Overview of the Issue",
+    "Deep Dive into Subtopics",
+    "Pro Arguments",
+    "Con Arguments",
+    "Cross-examination",
+    "Rebuttal",
+    "Expert Interviews",
+    "Panel Discussion",
+    "Case Studies",
+    "Myth Busting",
+    "Debunking Misconceptions",
+    "Audience Questions",
+    "Q&A Session",
+    "Listener Feedback",
+    "Rapid-fire Questions",
+    "Summary of Key Points",
+    "Recap",
+    "Key Takeaways",
+    "Actionable Tips",
+    "Call to Action",
+    "Future Outlook",
+    "Teaser for Next Episode",
+    "Closing Remarks",
+    "Thank You and Credits",
+    "Outtakes or Bloopers",
+    "Sponsor Messages",
+    "Social Media Shout-outs",
+    "Resource Recommendations",
+    "Feedback Request",
+    "Lightning Round",
+    "Behind-the-Scenes Insights",
+    "Ethical Considerations",
+    "Fact-checking Segment",
+    "Trending Topics",
+    "Closing Inspirational Quote",
+    "Final Reflections",
+    "Debrief",
+    "Farewell Messages",
+    "Next Episode Preview",
+    "Live Reactions",
+    "Call-in Segment",
+    "Acknowledgements",
+    "Transition Segments",
+    "Break Segments",
+]
+
+# Podcast Participant Roles: suggested values for the person1_role / person2_role fields
+participant_roles = [
+    "Main Summarizer",
+    "Questioner/Clarifier",
+    "Optimist",
+    "Skeptic",
+    "Specialist",
+    "Thesis Presenter",
+    "Counterargument Provider",
+    "Professor",
+    "Student",
+    "Moderator",
+    "Host",
+    "Co-host",
+    "Expert Guest",
+    "Novice",
+    "Devil's Advocate",
+    "Analyst",
+    "Storyteller",
+    "Fact-checker",
+    "Comedian",
+    "Interviewer",
+    "Interviewee",
+    "Historian",
+    "Visionary",
+    "Strategist",
+    "Critic",
+    "Enthusiast",
+    "Mediator",
+    "Commentator",
+    "Researcher",
+    "Reporter",
+    "Advocate",
+    "Influencer",
+    "Observer",
+    "Listener",
+    "Facilitator",
+    "Innovator",
+    "Debater",
+    "Educator",
+    "Motivator",
+    "Narrator",
+    "Explorer",
+    "Opponent",
+    "Proponent",
+    "Philosopher",
+    "Engineer",
+    "Doctor",
+    "Psychologist",
+    "Economist",
+    "Politician",
+    "Scientist",
+    "Entrepreneur",
+    "Artist",
+    "Author",
+    "Journalist",
+    "Activist",
+    "Challenger",
+    "Supporter",
+    "Mentor",
+    "Mentee",
+    "Panelist",
+    "Audience Representative",
+    "Case Study Presenter",
+    "Data Analyst",
+    "Ethicist",
+    "Cultural Critic",
+    "Technologist",
+    "Environmentalist",
+    "Legal Expert",
+    "Healthcare Professional",
+    "Financial Advisor",
+    "Policy Maker",
+    "Sociologist",
+    "Anthropologist",
+    "Myth Buster",
+    "Trend Analyst",
+    "Futurist",
+    "Negotiator",
+    "Community Leader",
+    "Voice of Reason",
+    "Conflict Resolver",
+    "Emotional Support",
+    "Pragmatist",
+    "Idealist",
+    "Realist",
+    "Satirist",
+    "Story Analyst",
+    "Language Expert",
+    "Historical Witness",
+    "Survivor",
+    "Inspirational Figure",
+    "Cultural Ambassador",
+    "Digital Nomad",
+    "Remote Correspondent",
+    "Field Reporter",
+    "Data Scientist",
+    "Gamer",
+    "Musician",
+    "Filmmaker",
+]
+
+# Engagement Techniques: suggested values for a template's engagement_technique tags
+engagement_techniques = [
+    "Rhetorical Questions",
+    "Anecdotes",
+    "Analogies",
+    "Humor",
+    "Metaphors",
+    "Storytelling",
+    "Quizzes",
+    "Polls",
+    "Contests/Giveaways",
+    "Guest Appearances",
+    "Sound Effects",
+    "Music Interludes",
+    "Shout-outs",
+    "Interactive Challenges",
+    "Personal Testimonials",
+    "Quotes",
+    "Jokes",
+    "Surprise Elements",
+    "Emotional Appeals",
+    "Provocative Statements",
+    "Irony",
+    "Sarcasm",
+    "Alliteration",
+    "Repetition",
+    "Foreshadowing",
+    "Cliffhangers",
+    "Audience Participation",
+    "Sensory Descriptions",
+    "Visual Aids (if applicable)",
+    "Callbacks to Earlier Points",
+    "Pop Culture References",
+    "Hyperbole",
+    "Parables",
+    "Thought Experiments",
+    "Puzzles and Riddles",
+    "Role-playing",
+    "Mock Scenarios",
+    "Debates",
+    "Sound Bites",
+    "Catchphrases",
+    "Voice Modulation",
+    "Interactive Games",
+    "Live Demos",
+    "Behind-the-Scenes Insights",
+    "Vivid Imagery",
+    "Statistics and Facts",
+    "Open-ended Questions",
+    "Challenges to Assumptions",
+    "Evoking Curiosity",
+    "Memes (if visual components are included)",
+    "Surveys",
+    "Testimonials",
+    "Provocations",
+]
--- a/pages/5_🎙️_Podcasts.py
+++ b/pages/5_🎙️_Podcasts.py
@ -0,0 +1,150 @@
+import streamlit as st
+from streamlit_tags import st_tags
+
+from open_notebook.plugins.podcasts import (
+    PodcastConfig,
+    PodcastEpisode,
+    conversation_styles,
+    dialogue_structures,
+    engagement_techniques,
+    participant_roles,
+)
+
+episodes_tab, templates_tab = st.tabs(["Episodes", "Templates"])
+
+with episodes_tab:
+    # Show every stored episode as raw JSON, or a placeholder when none exist.
+    episodes = PodcastEpisode.get_all()
+    if episodes:
+        for episode in episodes:
+            st.json(episode.model_dump())
+    else:
+        # An explicit emptiness check is needed here: a for/else runs its else
+        # branch after every loop that ends without `break`, so the placeholder
+        # would also be shown below a non-empty list.
+        st.write("No episodes yet")
+with templates_tab:
+    st.subheader("Podcast Templates")
+    st.markdown("")
+    with st.expander("**Create new Template**"):
+        # Collect the form values in a dict whose keys are the PodcastConfig
+        # field names, so it can be passed straight to the model on save.
+        pd_cfg = {}
+        pd_cfg["name"] = st.text_input("Template Name")
+        pd_cfg["podcast_name"] = st.text_input("Podcast Name")
+        pd_cfg["podcast_tagline"] = st.text_input("Podcast Tagline")
+        pd_cfg["output_language"] = st.text_input("Language", value="English")
+        pd_cfg["person1_role"] = st.text_input("Person 1 role")
+        st.caption(f"Suggestions:{', '.join(participant_roles)}")
+        pd_cfg["person2_role"] = st.text_input("Person 2 role")
+        # Start with no tags, like the other two tag inputs below.
+        pd_cfg["conversation_style"] = st_tags(
+            [], conversation_styles, "Conversation Style"
+        )
+        st.caption(f"Suggestions:{', '.join(conversation_styles)}")
+        pd_cfg["engagement_technique"] = st_tags(
+            [], engagement_techniques, "Engagement Techniques"
+        )
+        st.caption(f"Suggestions:{', '.join(engagement_techniques)}")
+        pd_cfg["dialogue_structure"] = st_tags(
+            [], dialogue_structures, "Dialogue Structure"
+        )
+        st.caption(f"Suggestions:{', '.join(dialogue_structures)}")
+        # NOTE(review): the slider starts at its minimum (400), which is below
+        # the lower word-count bound enforced by PodcastConfig; confirm the
+        # intended range and align the two.
+        pd_cfg["wordcount"] = st.slider(
+            "Word Count", min_value=400, max_value=6000, step=50
+        )
+        pd_cfg["creativity"] = st.slider(
+            "Creativity", min_value=0.0, max_value=1.0, step=0.05
+        )
+        pd_cfg["provider"] = st.selectbox("Provider", ["openai", "elevenlabs", "edge"])
+        pd_cfg["voice1"] = st.text_input("Voice 1")
+        pd_cfg["voice2"] = st.text_input("Voice 2")
+        pd_cfg["model"] = st.text_input("Model")
+        if st.button("Save"):
+            try:
+                new_config = PodcastConfig(**pd_cfg)
+            except ValueError as exc:
+                # pydantic's ValidationError is a ValueError subclass; report
+                # rejected input in the page instead of crashing the script.
+                st.error(f"Could not save template: {exc}")
+            else:
+                new_config.save()
+                st.success("Saved")
+
+    # Render one expander per stored template. Each widget writes its current
+    # value back onto the in-memory pd_config object; nothing is saved or
+    # deleted until one of the buttons at the bottom is pressed.
+    for pd_config in PodcastConfig.get_all():
+        with st.expander(pd_config.name):
+            # Widget keys include the config id so the same widgets can be
+            # repeated for every template.
+            pd_config.name = st.text_input(
+                "Template Name", value=pd_config.name, key=f"name_{pd_config.id}"
+            )
+            pd_config.podcast_name = st.text_input(
+                "Podcast Name",
+                value=pd_config.podcast_name,
+                key=f"podcast_name_{pd_config.id}",
+            )
+            pd_config.podcast_tagline = st.text_input(
+                "Podcast Tagline",
+                value=pd_config.podcast_tagline,
+                key=f"podcast_tagline_{pd_config.id}",
+            )
+            pd_config.output_language = st.text_input(
+                "Language",
+                value=pd_config.output_language,
+                key=f"output_language_{pd_config.id}",
+            )
+            pd_config.person1_role = st.text_input(
+                "Person 1 role",
+                value=pd_config.person1_role,
+                key=f"person1_role_{pd_config.id}",
+            )
+            st.caption(f"Suggestions:{', '.join(participant_roles)}")
+            pd_config.person2_role = st.text_input(
+                "Person 2 role",
+                value=pd_config.person2_role,
+                key=f"person2_role_{pd_config.id}",
+            )
+            # Tag inputs: current values first, then the suggestion list
+            pd_config.conversation_style = st_tags(
+                pd_config.conversation_style,
+                conversation_styles,
+                "Conversation Style",
+                key=f"conversation_style_{pd_config.id}",
+            )
+            st.caption(f"Suggestions:{', '.join(conversation_styles)}")
+            pd_config.engagement_technique = st_tags(
+                pd_config.engagement_technique,
+                engagement_techniques,
+                "Engagement Techniques",
+                key=f"engagement_technique_{pd_config.id}",
+            )
+            st.caption(f"Suggestions:{', '.join(engagement_techniques)}")
+            pd_config.dialogue_structure = st_tags(
+                pd_config.dialogue_structure,
+                dialogue_structures,
+                "Dialogue Structure",
+                key=f"dialogue_structure_{pd_config.id}",
+            )
+            st.caption(f"Suggestions:{', '.join(dialogue_structures)}")
+            pd_config.wordcount = st.slider(
+                "Word Count",
+                min_value=400,
+                max_value=6000,
+                step=50,
+                value=pd_config.wordcount,
+                key=f"wordcount_{pd_config.id}",
+            )
+            pd_config.creativity = st.slider(
+                "Creativity",
+                min_value=0.0,
+                max_value=1.0,
+                step=0.05,
+                value=pd_config.creativity,
+                key=f"creativity_{pd_config.id}",
+            )
+            # Preselect the stored provider by its position in the option list
+            pd_config.provider = st.selectbox(
+                "Provider",
+                ["openai", "elevenlabs", "edge"],
+                index=["openai", "elevenlabs", "edge"].index(pd_config.provider),
+                key=f"provider_{pd_config.id}",
+            )
+            pd_config.voice1 = st.text_input(
+                "Voice 1", value=pd_config.voice1, key=f"voice1_{pd_config.id}"
+            )
+            pd_config.voice2 = st.text_input(
+                "Voice 2", value=pd_config.voice2, key=f"voice2_{pd_config.id}"
+            )
+            pd_config.model = st.text_input(
+                "Model", value=pd_config.model, key=f"model_{pd_config.id}"
+            )
+
+            # Save the edited values, then rerun the script
+            if st.button("Save Config", key=f"btn_save{pd_config.id}"):
+                pd_config.save()
+                st.rerun()
+
+            # Delete this template, then rerun the script
+            if st.button("Delete Config", key=f"btn_delete{pd_config.id}"):
+                pd_config.delete()
+                st.rerun()
--- a/stream_app/chat.py
+++ b/stream_app/chat.py
@ -3,6 +3,7 @@ from langchain_core.runnables import RunnableConfig

 from open_notebook.domain import Note, Source
 from open_notebook.graphs.chat import graph as chat_graph
+from open_notebook.plugins.podcasts import PodcastConfig, PodcastEpisode
 from open_notebook.utils import token_count


@ -52,38 +53,59 @@ def execute_chat(txt_input, session_id):
    return result


+podcast_configs = PodcastConfig.get_all()
+podcast_config_names = [pd.name for pd in podcast_configs]
+
+
 # todo: se eu for usar o token count, preciso deixar configuravel
 # seria bom ter um total de tokens no admin em algum lugar
 def chat_sidebar(session_id):
    context = build_context(session_id=session_id)
    tokens = token_count(str(context) + str(st.session_state[session_id]["messages"]))
-    with st.container(border=True):
-        request = st.chat_input("Enter your question")
-        # removing for now since it's not multi-model capable right now
-        st.caption(f"Total tokens: {tokens}")
-        if request:
-            response = execute_chat(txt_input=request, session_id=session_id)
-            st.session_state[session_id]["messages"] = response["messages"]
+    chat_tab, podcast_tab = st.tabs(["Chat", "Podcast"])
+    with podcast_tab:
+        with st.container(border=True):
+            template = st.selectbox("Pick a template", podcast_config_names)
+            episode_name = st.text_input("Episode Name")
+            instructions = st.text_area("Instructions")
+            if st.button("Generate"):
+                epi = PodcastEpisode(
+                    name=episode_name,
+                    instructions=instructions,
+                    template=template,
+                    file_path="lallaa",
+                )
+                epi.save()
+            st.page_link("pages/5_🎙️_Podcasts.py", label="Go to Config")
+            st.divider()
+    with chat_tab:
+        with st.container(border=True):
+            request = st.chat_input("Enter your question")
+            # removing for now since it's not multi-model capable right now
+            st.caption(f"Total tokens: {tokens}")
+            if request:
+                response = execute_chat(txt_input=request, session_id=session_id)
+                st.session_state[session_id]["messages"] = response["messages"]

-        for msg in st.session_state[session_id]["messages"][::-1]:
-            if msg.type not in ["human", "ai"]:
-                continue
-            if not msg.content:
-                continue
+            for msg in st.session_state[session_id]["messages"][::-1]:
+                if msg.type not in ["human", "ai"]:
+                    continue
+                if not msg.content:
+                    continue

-            with st.chat_message(name=msg.type):
-                st.write(msg.content)
-                if msg.type == "ai":
-                    if st.button("💾 New Note", key=f"render_save_{msg.id}"):
-                        title = "New Note"
-                        content = msg.content
-                        note = Note(
-                            title=title,
-                            content=content,
-                            note_type="ai",
-                        )
-                        note.save()
-                        note.add_to_notebook(
-                            st.session_state[session_id]["notebook"].id
-                        )
-                        st.rerun()
+                with st.chat_message(name=msg.type):
+                    st.write(msg.content)
+                    if msg.type == "ai":
+                        if st.button("💾 New Note", key=f"render_save_{msg.id}"):
+                            title = "New Note"
+                            content = msg.content
+                            note = Note(
+                                title=title,
+                                content=content,
+                                note_type="ai",
+                            )
+                            note.save()
+                            note.add_to_notebook(
+                                st.session_state[session_id]["notebook"].id
+                            )
+                            st.rerun()