class WebsiteExtractor:
    """Fetch a web page and reduce it to clean plain text.

    Settings come from the ``website_extractor`` section of the mapping
    returned by ``load_config()``; every setting has a fallback default.
    """

    def __init__(self):
        """Load extractor settings from the ``website_extractor`` config section."""
        self.config = load_config()
        self.website_extractor_config = self.config.get("website_extractor", {})
        # Tag names whose elements are dropped before text extraction.
        self.unwanted_tags = self.website_extractor_config.get("unwanted_tags", [])
        self.user_agent = self.website_extractor_config.get("user_agent", "Mozilla/5.0")
        # Forwarded to ``requests.get`` as its ``timeout`` argument.
        self.timeout = self.website_extractor_config.get("timeout", 10)
        # Regular expressions whose matches are deleted from the final text.
        self.remove_patterns = self.website_extractor_config.get(
            "markdown_cleaning", {}
        ).get("remove_patterns", [])

    def extract_content(self, url: str) -> str:
        """Download ``url`` and return its cleaned text content.

        Args:
            url (str): Website URL; a missing scheme defaults to ``https://``.

        Returns:
            str: Extracted clean text content.

        Raises:
            Exception: If the URL is invalid, the request fails, or the page
                cannot be processed. The original error is attached as the
                exception's ``__cause__``.
        """
        try:
            normalized_url = self.normalize_url(url)

            headers = {"User-Agent": self.user_agent}
            response = requests.get(
                normalized_url, headers=headers, timeout=self.timeout
            )
            response.raise_for_status()  # Turn 4xx/5xx answers into exceptions

            # Pass the raw bytes rather than ``response.text``: BeautifulSoup
            # can then honour a charset declared inside the HTML itself
            # instead of relying on the encoding guessed from HTTP headers.
            soup = BeautifulSoup(response.content, "html.parser")

            self.remove_unwanted_elements(soup)

            raw_text = soup.get_text(separator="\n")
            return self.clean_content(raw_text)
        except requests.RequestException as e:
            logger.error("Failed to extract content from %s: %s", url, e)
            raise Exception(f"Failed to extract content from {url}: {str(e)}") from e
        except Exception as e:
            logger.error(
                "An unexpected error occurred while extracting content from %s: %s",
                url,
                e,
            )
            raise Exception(
                f"An unexpected error occurred while extracting content from {url}: {str(e)}"
            ) from e

    def normalize_url(self, url: str) -> str:
        """Return ``url`` with surrounding whitespace removed and a scheme ensured.

        Args:
            url (str): The URL to normalize.

        Returns:
            str: The normalized URL.

        Raises:
            ValueError: If the result has no scheme or no network location.
        """
        url = url.strip()

        # Scheme matching is case-insensitive so "HTTP://host" is not
        # mistaken for a scheme-less address and prefixed a second time.
        if not url.lower().startswith(("http://", "https://")):
            url = "https://" + url

        parsed = urlparse(url)
        if not all([parsed.scheme, parsed.netloc]):
            raise ValueError(f"Invalid URL: {url}")

        return parsed.geturl()

    def remove_unwanted_elements(self, soup: "BeautifulSoup") -> None:
        """Remove every element whose tag name is listed in ``unwanted_tags``.

        Args:
            soup (BeautifulSoup): The parsed document; it is modified in place.
        """
        for tag in self.unwanted_tags:
            for element in soup.find_all(tag):
                element.decompose()

    def clean_content(self, content: str) -> str:
        """Normalize whitespace in ``content`` and apply the configured patterns.

        Runs of spaces/tabs become a single space, whitespace around line
        breaks is dropped, and three or more consecutive line breaks are
        reduced to one blank line so paragraph boundaries survive.

        Args:
            content (str): The content to clean.

        Returns:
            str: Cleaned text content.
        """
        cleaned_content = html.unescape(content)

        # Collapse horizontal whitespace only. Collapsing "\s+" here would
        # also swallow the newlines and make the paragraph step below a no-op.
        cleaned_content = re.sub(r"[^\S\n]+", " ", cleaned_content)
        cleaned_content = re.sub(r" ?\n ?", "\n", cleaned_content)

        # Keep at most one blank line between paragraphs.
        cleaned_content = re.sub(r"\n{3,}", "\n\n", cleaned_content)

        # Apply custom cleaning patterns from config
        for pattern in self.remove_patterns:
            cleaned_content = re.sub(pattern, "", cleaned_content)

        return cleaned_content.strip()
class PodcastConfig(ObjectModel):
    """Reusable podcast template: show identity, speaker roles, style and voices."""

    table_name: ClassVar[str] = "podcast_config"

    # Single source of truth for the accepted word-count range (inclusive).
    # The field constraint, the validator and its error message all use it,
    # so they can no longer disagree with each other.
    WORDCOUNT_MIN: ClassVar[int] = 500
    WORDCOUNT_MAX: ClassVar[int] = 6000

    name: str
    podcast_name: str
    podcast_tagline: str
    output_language: str = Field(default="English")
    person1_role: str
    person2_role: str
    conversation_style: List[str]
    engagement_technique: List[str]
    dialogue_structure: List[str]
    wordcount: int = Field(ge=WORDCOUNT_MIN, le=WORDCOUNT_MAX)
    creativity: float = Field(ge=0, le=1)
    provider: Literal["openai", "elevenlabs", "edge"] = Field(default="openai")
    voice1: str
    voice2: str
    model: str

    @field_validator("wordcount")
    @classmethod
    def validate_wordcount(cls, value):
        """Reject word counts outside ``WORDCOUNT_MIN``..``WORDCOUNT_MAX``."""
        if not cls.WORDCOUNT_MIN <= value <= cls.WORDCOUNT_MAX:
            raise ValueError(
                f"Wordcount must be between {cls.WORDCOUNT_MIN} and {cls.WORDCOUNT_MAX}"
            )
        return value

    @field_validator("creativity")
    @classmethod
    def validate_creativity(cls, value):
        """Reject creativity values outside the closed interval 0..1."""
        if not 0 <= value <= 1:
            raise ValueError("Creativity must be between 0 and 1")
        return value
+ "Speculative", + "Motivational", + "Fun", + "Technical", + "Light-hearted", + "Serious", + "Investigative", + "Debunking", + "Collaborative", + "Didactic", + "Thought-provoking", + "Controversial", + "Skeptical", + "Optimistic", + "Pessimistic", + "Objective", + "Subjective", + "Sarcastic", + "Emotional", + "Exploratory", + "Friendly", + "Fast-paced", + "Slow-paced", + "Introspective", + "Open-ended", + "Affirmative", + "Dissenting", +] + +# Dialogue Structures +dialogue_structures = [ + "Topic Introduction", + "Opening Monologue", + "Guest Introduction", + "Icebreakers", + "Historical Context", + "Defining Terms", + "Problem Statement", + "Overview of the Issue", + "Deep Dive into Subtopics", + "Pro Arguments", + "Con Arguments", + "Cross-examination", + "Rebuttal", + "Expert Interviews", + "Panel Discussion", + "Case Studies", + "Myth Busting", + "Debunking Misconceptions", + "Audience Questions", + "Q&A Session", + "Listener Feedback", + "Rapid-fire Questions", + "Summary of Key Points", + "Recap", + "Key Takeaways", + "Actionable Tips", + "Call to Action", + "Future Outlook", + "Teaser for Next Episode", + "Closing Remarks", + "Thank You and Credits", + "Outtakes or Bloopers", + "Sponsor Messages", + "Social Media Shout-outs", + "Resource Recommendations", + "Feedback Request", + "Lightning Round", + "Behind-the-Scenes Insights", + "Ethical Considerations", + "Fact-checking Segment", + "Trending Topics", + "Closing Inspirational Quote", + "Final Reflections", + "Debrief", + "Farewell Messages", + "Next Episode Preview", + "Live Reactions", + "Call-in Segment", + "Acknowledgements", + "Transition Segments", + "Break Segments", +] + +# Podcast Participant Roles +participant_roles = [ + "Main Summarizer", + "Questioner/Clarifier", + "Optimist", + "Skeptic", + "Specialist", + "Thesis Presenter", + "Counterargument Provider", + "Professor", + "Student", + "Moderator", + "Host", + "Co-host", + "Expert Guest", + "Novice", + "Devil's Advocate", + "Analyst", + 
"Storyteller", + "Fact-checker", + "Comedian", + "Interviewer", + "Interviewee", + "Historian", + "Visionary", + "Strategist", + "Critic", + "Enthusiast", + "Mediator", + "Commentator", + "Researcher", + "Reporter", + "Advocate", + "Influencer", + "Observer", + "Listener", + "Facilitator", + "Innovator", + "Debater", + "Educator", + "Motivator", + "Narrator", + "Explorer", + "Opponent", + "Proponent", + "Philosopher", + "Engineer", + "Doctor", + "Psychologist", + "Economist", + "Politician", + "Scientist", + "Entrepreneur", + "Artist", + "Author", + "Journalist", + "Activist", + "Challenger", + "Supporter", + "Mentor", + "Mentee", + "Panelist", + "Audience Representative", + "Case Study Presenter", + "Data Analyst", + "Ethicist", + "Cultural Critic", + "Technologist", + "Environmentalist", + "Legal Expert", + "Healthcare Professional", + "Financial Advisor", + "Policy Maker", + "Sociologist", + "Anthropologist", + "Myth Buster", + "Trend Analyst", + "Futurist", + "Negotiator", + "Community Leader", + "Voice of Reason", + "Conflict Resolver", + "Emotional Support", + "Pragmatist", + "Idealist", + "Realist", + "Satirist", + "Story Analyst", + "Language Expert", + "Historical Witness", + "Survivor", + "Inspirational Figure", + "Cultural Ambassador", + "Digital Nomad", + "Remote Correspondent", + "Field Reporter", + "Data Scientist", + "Gamer", + "Musician", + "Filmmaker", +] + +# Engagement Techniques +engagement_techniques = [ + "Rhetorical Questions", + "Anecdotes", + "Analogies", + "Humor", + "Metaphors", + "Storytelling", + "Quizzes", + "Polls", + "Contests/Giveaways", + "Guest Appearances", + "Sound Effects", + "Music Interludes", + "Shout-outs", + "Interactive Challenges", + "Personal Testimonials", + "Quotes", + "Jokes", + "Surprise Elements", + "Emotional Appeals", + "Provocative Statements", + "Irony", + "Sarcasm", + "Alliteration", + "Repetition", + "Foreshadowing", + "Cliffhangers", + "Audience Participation", + "Sensory Descriptions", + "Visual Aids (if 
# Providers offered in the UI; must match the values PodcastConfig accepts.
PROVIDERS = ["openai", "elevenlabs", "edge"]

# Word-count slider bounds. 550 is the lowest 50-step value the model accepts
# whether its lower limit is exclusive or inclusive of 500; the previous
# minimum of 400 produced values that failed validation on save.
WORDCOUNT_MIN = 550
WORDCOUNT_MAX = 6000

episodes_tab, templates_tab = st.tabs(["Episodes", "Templates"])

with episodes_tab:
    episodes = PodcastEpisode.get_all()
    # An explicit emptiness check replaces the former ``for ... else``: a
    # loop's ``else`` runs whenever the loop finishes without ``break``, so
    # the "no episodes" message was shown even when episodes were listed.
    if not episodes:
        st.write("No episodes yet")
    for episode in episodes:
        st.json(episode.model_dump())

with templates_tab:
    st.subheader("Podcast Templates")
    st.markdown("")

    # --- Form for creating a new template -------------------------------
    with st.expander("**Create new Template**"):
        pd_cfg = {}
        pd_cfg["name"] = st.text_input("Template Name")
        pd_cfg["podcast_name"] = st.text_input("Podcast Name")
        pd_cfg["podcast_tagline"] = st.text_input("Podcast Tagline")
        pd_cfg["output_language"] = st.text_input("Language", value="English")
        pd_cfg["person1_role"] = st.text_input("Person 1 role")
        st.caption(f"Suggestions:{', '.join(participant_roles)}")
        pd_cfg["person2_role"] = st.text_input("Person 2 role")
        # Starts empty like the other tag inputs (a leftover "a" tag used to
        # be pre-filled here).
        pd_cfg["conversation_style"] = st_tags(
            [], conversation_styles, "Conversation Style"
        )
        st.caption(f"Suggestions:{', '.join(conversation_styles)}")
        pd_cfg["engagement_technique"] = st_tags(
            [], engagement_techniques, "Engagement Techniques"
        )
        st.caption(f"Suggestions:{', '.join(engagement_techniques)}")
        pd_cfg["dialogue_structure"] = st_tags(
            [], dialogue_structures, "Dialogue Structure"
        )
        st.caption(f"Suggestions:{', '.join(dialogue_structures)}")
        pd_cfg["wordcount"] = st.slider(
            "Word Count", min_value=WORDCOUNT_MIN, max_value=WORDCOUNT_MAX, step=50
        )
        pd_cfg["creativity"] = st.slider(
            "Creativity", min_value=0.0, max_value=1.0, step=0.05
        )
        pd_cfg["provider"] = st.selectbox("Provider", PROVIDERS)
        pd_cfg["voice1"] = st.text_input("Voice 1")
        pd_cfg["voice2"] = st.text_input("Voice 2")
        pd_cfg["model"] = st.text_input("Model")
        if st.button("Save"):
            try:
                new_config = PodcastConfig(**pd_cfg)
            except ValueError as exc:
                # Pydantic's ValidationError derives from ValueError; report
                # it in the page instead of aborting the script run.
                st.error(str(exc))
            else:
                new_config.save()
                st.success("Saved")

    # --- One editable expander per stored template ------------------------
    # Widget keys embed the record id so each template gets its own state.
    for pd_config in PodcastConfig.get_all():
        with st.expander(pd_config.name):
            pd_config.name = st.text_input(
                "Template Name", value=pd_config.name, key=f"name_{pd_config.id}"
            )
            pd_config.podcast_name = st.text_input(
                "Podcast Name",
                value=pd_config.podcast_name,
                key=f"podcast_name_{pd_config.id}",
            )
            pd_config.podcast_tagline = st.text_input(
                "Podcast Tagline",
                value=pd_config.podcast_tagline,
                key=f"podcast_tagline_{pd_config.id}",
            )
            pd_config.output_language = st.text_input(
                "Language",
                value=pd_config.output_language,
                key=f"output_language_{pd_config.id}",
            )
            pd_config.person1_role = st.text_input(
                "Person 1 role",
                value=pd_config.person1_role,
                key=f"person1_role_{pd_config.id}",
            )
            st.caption(f"Suggestions:{', '.join(participant_roles)}")
            pd_config.person2_role = st.text_input(
                "Person 2 role",
                value=pd_config.person2_role,
                key=f"person2_role_{pd_config.id}",
            )
            pd_config.conversation_style = st_tags(
                pd_config.conversation_style,
                conversation_styles,
                "Conversation Style",
                key=f"conversation_style_{pd_config.id}",
            )
            st.caption(f"Suggestions:{', '.join(conversation_styles)}")
            pd_config.engagement_technique = st_tags(
                pd_config.engagement_technique,
                engagement_techniques,
                "Engagement Techniques",
                key=f"engagement_technique_{pd_config.id}",
            )
            st.caption(f"Suggestions:{', '.join(engagement_techniques)}")
            pd_config.dialogue_structure = st_tags(
                pd_config.dialogue_structure,
                dialogue_structures,
                "Dialogue Structure",
                key=f"dialogue_structure_{pd_config.id}",
            )
            st.caption(f"Suggestions:{', '.join(dialogue_structures)}")
            pd_config.wordcount = st.slider(
                "Word Count",
                min_value=WORDCOUNT_MIN,
                max_value=WORDCOUNT_MAX,
                step=50,
                # Clamp so a stored value outside the slider range cannot
                # make the widget raise.
                value=min(max(pd_config.wordcount, WORDCOUNT_MIN), WORDCOUNT_MAX),
                key=f"wordcount_{pd_config.id}",
            )
            pd_config.creativity = st.slider(
                "Creativity",
                min_value=0.0,
                max_value=1.0,
                step=0.05,
                value=pd_config.creativity,
                key=f"creativity_{pd_config.id}",
            )
            pd_config.provider = st.selectbox(
                "Provider",
                PROVIDERS,
                index=PROVIDERS.index(pd_config.provider),
                key=f"provider_{pd_config.id}",
            )
            pd_config.voice1 = st.text_input(
                "Voice 1", value=pd_config.voice1, key=f"voice1_{pd_config.id}"
            )
            pd_config.voice2 = st.text_input(
                "Voice 2", value=pd_config.voice2, key=f"voice2_{pd_config.id}"
            )
            pd_config.model = st.text_input(
                "Model", value=pd_config.model, key=f"model_{pd_config.id}"
            )

            if st.button("Save Config", key=f"btn_save{pd_config.id}"):
                pd_config.save()
                st.rerun()

            if st.button("Delete Config", key=f"btn_delete{pd_config.id}"):
                pd_config.delete()
                st.rerun()
# TODO: if the token count is going to be used, it needs to be configurable.
# It would also be good to show a total token count somewhere in the admin.
def chat_sidebar(session_id):
    """Render the sidebar with a "Chat" tab and a "Podcast" tab.

    The Podcast tab collects a template, an episode name and instructions and
    stores a ``PodcastEpisode`` when "Generate" is pressed. The Chat tab takes
    a question, runs it through ``execute_chat`` and lists the session's
    human/AI messages newest first, each AI message with a save-as-note button.

    Args:
        session_id: Key of this chat session inside ``st.session_state``.
    """
    context = build_context(session_id=session_id)
    tokens = token_count(str(context) + str(st.session_state[session_id]["messages"]))
    chat_tab, podcast_tab = st.tabs(["Chat", "Podcast"])
    with podcast_tab:
        with st.container(border=True):
            # Query the templates on every run: a list built once at module
            # import would not include templates created afterwards.
            template_names = [config.name for config in PodcastConfig.get_all()]
            template = st.selectbox("Pick a template", template_names)
            episode_name = st.text_input("Episode Name")
            instructions = st.text_area("Instructions")
            if st.button("Generate"):
                if template is None:
                    # With no templates the selectbox yields None, which the
                    # episode model would reject with a validation error.
                    st.warning("Create a podcast template first")
                elif not episode_name.strip():
                    st.warning("Please enter an episode name")
                else:
                    epi = PodcastEpisode(
                        name=episode_name,
                        instructions=instructions,
                        template=template,
                        # NOTE(review): placeholder value carried over as-is;
                        # replace with the generated audio path once episode
                        # generation is implemented.
                        file_path="lallaa",
                    )
                    epi.save()
                    st.success("Episode saved")
        st.page_link("pages/5_🎙️_Podcasts.py", label="Go to Config")
        st.divider()
    with chat_tab:
        with st.container(border=True):
            request = st.chat_input("Enter your question")
            # removing for now since it's not multi-model capable right now
            st.caption(f"Total tokens: {tokens}")
            if request:
                response = execute_chat(txt_input=request, session_id=session_id)
                st.session_state[session_id]["messages"] = response["messages"]

            # Newest message first; only non-empty human/AI turns are shown.
            for msg in st.session_state[session_id]["messages"][::-1]:
                if msg.type not in ["human", "ai"]:
                    continue
                if not msg.content:
                    continue

                with st.chat_message(name=msg.type):
                    st.write(msg.content)
                    if msg.type == "ai":
                        if st.button("💾 New Note", key=f"render_save_{msg.id}"):
                            title = "New Note"
                            content = msg.content
                            note = Note(
                                title=title,
                                content=content,
                                note_type="ai",
                            )
                            note.save()
                            note.add_to_notebook(
                                st.session_state[session_id]["notebook"].id
                            )
                            st.rerun()