add podcast support

This commit is contained in:
LUIS NOVO 2024-10-26 05:17:58 -03:00
parent 39fe547e61
commit 01f8eab10e
19 changed files with 1386 additions and 247 deletions

View file

@ -7,4 +7,6 @@ data/
.env
sqlite-db/
temp/
google-credentials.json
google-credentials.json
docker-compose*
.docker_data

View file

@ -14,9 +14,15 @@ OPENAI_API_KEY=
# EXAMPLE - anthropic/claude-3-5-sonnet-20240620
ANTHROPIC_API_KEY=
# GEMINI
# USE MODEL NAMES AS "gemini/<modelname>"
# EXAMPLE - gemini/gemini-1.5-pro-002
GEMINI_API_KEY=
# VERTEXAI
# USE MODEL NAMES AS "vertexai/<modelname>"
# EXAMPLE - vertexai/gemini-1.5-pro-001
# EXAMPLE - vertexai/gemini-1.5-pro-002
VERTEX_PROJECT=my-google-cloud-project-name
GOOGLE_APPLICATION_CREDENTIALS=./google-credentials.json
@ -31,6 +37,11 @@ OLLAMA_API_BASE="http://10.20.30.20:11434"
OPENROUTER_BASE_URL="https://openrouter.ai/api/v1"
OPENROUTER_API_KEY=
# ELEVENLABS
# Used only by the podcast feature
ELEVENLABS_API_KEY=
# USE THIS IF YOU WANT TO DEBUG THE APP ON LANGSMITH
# LANGCHAIN_TRACING_V2=true
# LANGCHAIN_ENDPOINT="https://api.smith.langchain.com"

View file

@ -4,7 +4,7 @@ FROM python:3.11.7-slim-bullseye
# Install system dependencies required for building certain Python packages
RUN apt-get update && apt-get install -y \
gcc \
curl wget libmagic-dev \
curl wget libmagic-dev ffmpeg \
&& rm -rf /var/lib/apt/lists/*
# Set the working directory in the container to /app
@ -23,6 +23,8 @@ WORKDIR /app
EXPOSE 8502
RUN mkdir -p /app/sqlite-db
RUN mkdir -p /data
RUN mkdir -p /surrealdb-data
CMD ["poetry", "run", "streamlit", "run", "app_home.py"]

View file

@ -10,17 +10,7 @@ Open Notebook empowers you to manage your research, generate AI-assisted notes,
Go to the [Setup Guide](docs/SETUP.md) to learn how to set up the tool in detail.
But, the gist of it is:
```sh
git clone https://github.com/lfnovo/open_notebook.git
cd open_notebook
cp .env.sample .env
poetry install
poetry run streamlit run app_home.py
```
or with Docker/Portainer:
To set up with Docker/Portainer:
```yaml
version: '3'
@ -52,7 +42,6 @@ volumes:
```
## Usage Instructions
Go to the [Usage](docs/USAGE.md) page to learn how to use all features.
@ -68,10 +57,16 @@ Go to the [Usage](docs/USAGE.md) page to learn how to use all features.
- **Recursive Summarization**: Tackle large content by recursively summarizing it.
- **Integrated Search Engines**: Built-in full-text and vector search for faster information retrieval.
- **Fine-Grained Context Management**: Choose exactly what to share with the AI to maintain control.
- **Cost Estimation**: Estimate costs for large context processing to keep budget control in check.
- **Podcast Generator**: Automatically convert your notes into a podcast format.
## 🚀 New Features
### v0.0.4 - Podcasts 🎙️
You can now build amazing custom podcasts based on your own data. Customize your speakers, episode structure, cadence, voices, etc.
Head to the [Podcasts](docs/PODCASTS.md) page for more info.
### v0.0.3 - Transformations ✨
We just released a much more powerful way to create more value from your sources.
@ -114,19 +109,18 @@ Locate anything across your research with ease using full-text and vector-based
Jinja based prompts that are easy to customize to your own preferences.
## 🌟 Coming Soon
## 🌟 Roadmap
- **Podcast Generator**: Automatically convert your notes into a podcast format.
- **Enhanced Citations**: Improved layout and finer control for citations.
- **Better Embeddings & Summarization**: Smarter ways to distill information.
- **Multiple Chat Sessions**: Juggle different discussions within the same notebook.
- **Live Front-End Updates**: Real-time UI updates for a smoother experience.
- **Async Processing**: Faster UI through asynchronous content processing.
- **Improved Error Handling**: Making everything more robust.
- **Cross-Notebook Sources and Notes**: Reuse research notes across projects.
- **Bookmark Integration**: Integrate with your favorite bookmarking app.
- **Multi-model support**: Open AI, Anthropic, Vertex AI, Open Router, Ollama, etc. ✅ 0.0.2
- **Insight Generation**: New tools for creating insights - [transformations](docs/TRANSFORMATIONS.md) ✅ 0.0.3
- **Podcast Generator**: Automatically convert your notes into a podcast format. ✅ 0.0.4
## 💻 Tech Stack
@ -134,6 +128,7 @@ Jinja based prompts that are easy to customize to your own preferences.
- **Streamlit**: For the front-end (Looking to move out of Streamlit. Contributors welcome!).
- **SurrealDB**: Fast, scalable database solution.
- **Langchain/Langgraph**: The backbone for LLM interactions.
- **Podcastfy**: For generating podcasts from your notes.
## 🙌 Help Wanted
@ -151,3 +146,9 @@ Open Notebook is MIT licensed. See the [LICENSE](LICENSE) file for details.
---
Your contributions, feature requests, and bug reports are always welcome. Let's build a research tool that respects our privacy and makes learning truly open for everyone. ✨
---
This project uses the following third-party libraries:
- [Podcastfy](https://github.com/souzatharsis/podcastfy) - Licensed under the Apache License 2.0

View file

@ -24,4 +24,4 @@ except InvalidDatabaseSchema as e:
st.rerun()
except Exception as e:
st.error(e)
st.stop()
st.stop()

View file

@ -76,5 +76,7 @@ DEFINE EVENT IF NOT EXISTS source_delete ON TABLE source WHEN ($after == NONE) T
delete source_insight where source == $before.id;
};
DEFINE TABLE IF NOT EXISTS podcast_config SCHEMALESS;
UPDATE open_notebook:database_info SET
version= "0.0.2";

View file

@ -6,7 +6,7 @@ services:
ports:
- "8000:8000"
volumes:
- ./surreal-data:/mydata
- ./.docker_data/surreal-data:/mydata
user: "${UID}:${GID}"
command: start --log trace --user root --pass root rocksdb:mydatabase.db
pull_policy: always
@ -17,6 +17,8 @@ services:
ports:
- "8080:8502"
volumes:
- ./.docker_data/data:/app/data
- ./docker.env:/app/.env
- ./google-credentials.json:/app/google-credentials.json
depends_on:
- surrealdb

25
docs/PODCASTS.md Normal file
View file

@ -0,0 +1,25 @@
# Podcasts
**For the audio learners**
The podcast feature made Google Notebook LM famous, and for good reason. It can make your learning process 10x more enjoyable and efficient by presenting a lot of insights in a way that is very easy to consume.
### Define Templates
Set up a template for your podcast, define the role of the speakers, the format of the podcast, and the length of each episode.
![Podcast Templates](assets/podcast_template.png)
### Pick your context
Pick the context for your podcast. You can use your own notes and assets.
![Context](assets/context.png)
### Generate your podcast
![Generate podcast](assets/podcast.png)
### Manage, Listen to and Download your episodes
![Podcast episodes](assets/podcast_listen.png)

View file

@ -49,6 +49,14 @@ Or by turning any LLM message into a Note.
![New Notebook](assets/ai_note.png)
## Generate your podcasts
Once you have your content ready, start creating beautiful podcast episodes from it.
![Podcast episodes](assets/podcast_listen.png)
See more at the [Podcasts](PODCASTS.md) section.
## Searching
The search page gives you a glance of all the notes you have made and the sources you have added. You can query the database both by keyword and by vector search.

BIN
docs/assets/podcast.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 31 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 32 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 288 KiB

View file

@ -40,7 +40,7 @@ class ObjectModel(BaseModel):
except Exception as e:
logger.error(f"Error fetching all {cls.table_name}: {str(e)}")
logger.exception(e)
raise DatabaseOperationError(f"Failed to fetch all {cls.table_name}")
raise DatabaseOperationError(e)
@classmethod
def get(cls: Type[T], id: str) -> Optional[T]:
@ -152,7 +152,7 @@ class Notebook(ObjectModel):
except Exception as e:
logger.error(f"Error fetching sources for notebook {self.id}: {str(e)}")
logger.exception(e)
raise DatabaseOperationError("Failed to fetch sources for notebook")
raise DatabaseOperationError(e)
@property
def notes(self) -> List["Note"]:
@ -171,7 +171,7 @@ class Notebook(ObjectModel):
except Exception as e:
logger.error(f"Error fetching notes for notebook {self.id}: {str(e)}")
logger.exception(e)
raise DatabaseOperationError("Failed to fetch notes for notebook")
raise DatabaseOperationError(e)
class Asset(BaseModel):

View file

@ -1,179 +0,0 @@
"""
Website Extractor Module
This module is responsible for extracting clean text content from websites using
BeautifulSoup for local HTML parsing instead of the Jina AI API.
"""
import html
import logging
import re
from typing import List
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup
from podcastfy.utils.config import load_config
logger = logging.getLogger(__name__)
class WebsiteExtractor:
    """
    Fetch a web page and reduce it to clean plain text.

    Settings (unwanted tags, user agent, timeout, cleaning patterns) are read
    from the ``website_extractor`` section of the configuration returned by
    ``load_config()``.
    """

    def __init__(self):
        """
        Initialize the WebsiteExtractor from the loaded configuration.
        """
        self.config = load_config()
        self.website_extractor_config = self.config.get("website_extractor", {})
        self.unwanted_tags = self.website_extractor_config.get("unwanted_tags", [])
        self.user_agent = self.website_extractor_config.get("user_agent", "Mozilla/5.0")
        self.timeout = self.website_extractor_config.get("timeout", 10)
        self.remove_patterns = self.website_extractor_config.get(
            "markdown_cleaning", {}
        ).get("remove_patterns", [])

    def extract_content(self, url: str) -> str:
        """
        Extract clean text content from a website using BeautifulSoup.

        Args:
            url (str): Website URL.

        Returns:
            str: Extracted clean text content.

        Raises:
            Exception: If there's an error in extracting the content. The
                original error is attached as ``__cause__``.
        """
        try:
            # Normalize the URL
            normalized_url = self.normalize_url(url)

            # Request the webpage
            headers = {"User-Agent": self.user_agent}
            response = requests.get(
                normalized_url, headers=headers, timeout=self.timeout
            )
            response.raise_for_status()  # Raise an exception for bad status codes

            # Parse the page content with BeautifulSoup
            soup = BeautifulSoup(response.text, "html.parser")

            # Remove unwanted elements
            self.remove_unwanted_elements(soup)

            # Extract and clean the text content
            raw_text = soup.get_text(separator="\n")  # Get all text content
            return self.clean_content(raw_text)
        except requests.RequestException as e:
            logger.error(f"Failed to extract content from {url}: {str(e)}")
            # Chain the cause so the original traceback is not lost.
            raise Exception(f"Failed to extract content from {url}: {str(e)}") from e
        except Exception as e:
            logger.error(
                f"An unexpected error occurred while extracting content from {url}: {str(e)}"
            )
            raise Exception(
                f"An unexpected error occurred while extracting content from {url}: {str(e)}"
            ) from e

    def normalize_url(self, url: str) -> str:
        """
        Normalize the given URL by adding scheme if missing and ensuring it's a valid URL.

        Args:
            url (str): The URL to normalize.

        Returns:
            str: The normalized URL.

        Raises:
            ValueError: If the URL is invalid after normalization attempts.
        """
        # If the URL doesn't start with a scheme, add 'https://'
        if not url.startswith(("http://", "https://")):
            url = "https://" + url

        # Parse the URL
        parsed = urlparse(url)

        # Ensure the URL has a valid scheme and netloc
        if not all([parsed.scheme, parsed.netloc]):
            raise ValueError(f"Invalid URL: {url}")

        return parsed.geturl()

    def remove_unwanted_elements(self, soup: "BeautifulSoup") -> None:
        """
        Remove unwanted elements from the BeautifulSoup object.

        Args:
            soup (BeautifulSoup): The BeautifulSoup object to clean (modified in place).
        """
        for tag in self.unwanted_tags:
            for element in soup.find_all(tag):
                element.decompose()

    def clean_content(self, content: str) -> str:
        """
        Clean the extracted content by removing unnecessary whitespace and applying
        custom cleaning patterns.

        Runs of spaces/tabs are collapsed to a single space, whitespace around
        line breaks is dropped, and three or more consecutive newlines are
        reduced to a single blank line so paragraph breaks survive.

        Args:
            content (str): The content to clean.

        Returns:
            str: Cleaned text content.
        """
        # Decode HTML entities
        cleaned_content = html.unescape(content)

        # Collapse horizontal whitespace only. Using r"\s+" here would also
        # swallow every newline and make the newline normalization below a no-op.
        cleaned_content = re.sub(r"[^\S\n]+", " ", cleaned_content)

        # Drop the single space left on either side of a line break
        cleaned_content = re.sub(r" ?\n ?", "\n", cleaned_content)

        # Remove extra newlines
        cleaned_content = re.sub(r"\n{3,}", "\n\n", cleaned_content)

        # Apply custom cleaning patterns from config
        for pattern in self.remove_patterns:
            cleaned_content = re.sub(pattern, "", cleaned_content)

        return cleaned_content.strip()
def main(seed: int = 42) -> None:
    """
    Smoke-test the WebsiteExtractor class against a few sample URLs.

    Args:
        seed (int): Accepted for interface compatibility; not used here.
    """
    logging.basicConfig(level=logging.INFO)

    site_extractor = WebsiteExtractor()

    # Sample URLs: one without a scheme, one fully qualified
    sample_urls: List[str] = [
        "www.souzatharsis.com",
        "https://en.wikipedia.org/wiki/Web_scraping",
    ]

    for target in sample_urls:
        try:
            logger.info(f"Extracting content from: {target}")
            extracted = site_extractor.extract_content(target)

            # Log a short preview followed by the overall size
            preview = extracted[:500]
            logger.info(f"Extracted content (first 500 characters):\n{preview}...")
            logger.info(
                f"Total length of extracted content: {len(extracted)} characters"
            )
            logger.info("-" * 50)
        except Exception as err:
            logger.error(f"An error occurred while processing {target}: {str(err)}")


# Run the smoke test only when executed as a script
if __name__ == "__main__":
    main()

View file

@ -1,5 +1,7 @@
from typing import ClassVar, List, Literal
from loguru import logger
from podcastfy.client import generate_podcast
from pydantic import Field, field_validator
from open_notebook.domain import ObjectModel
@ -10,7 +12,8 @@ class PodcastEpisode(ObjectModel):
name: str
template: str
instructions: str
file_path: str
text: str
audio_file: str
class PodcastConfig(ObjectModel):
@ -24,6 +27,7 @@ class PodcastConfig(ObjectModel):
conversation_style: List[str]
engagement_technique: List[str]
dialogue_structure: List[str]
user_instructions: str
wordcount: int = Field(gt=500, lt=10000)
creativity: float = Field(ge=0, le=1)
provider: Literal["openai", "elevenlabs", "edge"] = Field(default="openai")
@ -31,6 +35,56 @@ class PodcastConfig(ObjectModel):
voice2: str
model: str
def generate_episode(self, episode_name, text, instructions=None):
    """
    Generate a podcast episode from ``text`` and persist it.

    Builds the conversation config from this template's fields, calls
    ``generate_podcast`` and saves a ``PodcastEpisode`` that references the
    returned audio file.

    Args:
        episode_name: Name stored on the resulting episode.
        text: Source content handed to ``generate_podcast``; stored on the
            episode as ``str(text)``.
        instructions: Optional per-episode instructions. When empty or
            ``None``, the template's ``user_instructions`` are used instead.

    Returns:
        The saved ``PodcastEpisode``.
    """
    # Resolve into a local instead of overwriting self.user_instructions, so a
    # one-off override cannot leak into the template if it is saved later.
    effective_instructions = (
        instructions if instructions else self.user_instructions
    )

    # The TTS section is keyed by provider name, hence the dynamic key below.
    tts_config = {
        "ending_message": "Thank you for listening to this episode. Don't forget to subscribe to our podcast for more interesting conversations.",
        "default_tts_model": self.provider,
        self.provider: {
            "default_voices": {
                "question": self.voice1,
                "answer": self.voice2,
            },
            "model": self.model,
        },
        "audio_format": "mp3",
    }
    conversation_config = {
        "word_count": self.wordcount,
        "conversation_style": self.conversation_style,
        "roles_person1": self.person1_role,
        "roles_person2": self.person2_role,
        "dialogue_structure": self.dialogue_structure,
        "podcast_name": self.podcast_name,
        "podcast_tagline": self.podcast_tagline,
        "output_language": self.output_language,
        "user_instructions": effective_instructions,
        "engagement_techniques": self.engagement_technique,
        "creativity": self.creativity,
        "text_to_speech": tts_config,
    }

    logger.debug(
        f"Generating episode {episode_name} with config {conversation_config}"
    )

    audio_file = generate_podcast(
        conversation_config=conversation_config, text=text, tts_model=self.provider
    )
    logger.debug(f"Generated audio file for episode {episode_name}: {audio_file}")

    # Store the instructions that were actually used; the raw argument may be
    # None, which the episode's ``instructions: str`` field does not accept.
    episode = PodcastEpisode(
        name=episode_name,
        template=self.name,
        instructions=effective_instructions,
        text=str(text),
        audio_file=audio_file,
    )
    episode.save()
    return episode
@field_validator("wordcount")
def validate_wordcount(cls, value):
if not 500 <= value <= 6000:

View file

@ -15,8 +15,22 @@ episodes_tab, templates_tab = st.tabs(["Episodes", "Templates"])
with episodes_tab:
episodes = PodcastEpisode.get_all()
for episode in episodes:
st.json(episode.model_dump())
else:
with st.container(border=True):
episode_name = episode.name if episode.name else "No Name"
st.markdown(f"**{episode.template} - {episode_name}**")
# st.caption(naturaltime(episode.created))
st.write(f"Instructions: {episode.instructions}")
try:
st.audio(episode.audio_file, format="audio/mpeg", loop=True)
except Exception as e:
st.write("No audio file found")
st.error(e)
with st.expander("Source Content"):
st.code(episode.text)
if st.button("Delete Episode", key=f"btn_delete{episode.id}"):
episode.delete()
st.rerun()
if len(episodes) == 0:
st.write("No episodes yet")
with templates_tab:
st.subheader("Podcast Templates")
@ -27,6 +41,10 @@ with templates_tab:
pd_cfg["podcast_name"] = st.text_input("Podcast Name")
pd_cfg["podcast_tagline"] = st.text_input("Podcast Tagline")
pd_cfg["output_language"] = st.text_input("Language", value="English")
pd_cfg["user_instructions"] = st.text_input(
"User Instructions",
help="Any additional intructions to pass to the LLM that will generate the transcript",
)
pd_cfg["person1_role"] = st.text_input("Person 1 role")
st.caption(f"Suggestions:{', '.join(participant_roles)}")
pd_cfg["person2_role"] = st.text_input("Person 2 role")
@ -49,13 +67,18 @@ with templates_tab:
"Creativity", min_value=0.0, max_value=1.0, step=0.05
)
pd_cfg["provider"] = st.selectbox("Provider", ["openai", "elevenlabs", "edge"])
pd_cfg["voice1"] = st.text_input("Voice 1")
pd_cfg["voice2"] = st.text_input("Voice 2")
pd_cfg["voice1"] = st.text_input(
"Voice 1", help="You can use Elevenlabs voice ID"
)
pd_cfg["voice2"] = st.text_input(
"Voice 2", help="You can use Elevenlabs voice ID"
)
pd_cfg["model"] = st.text_input("Model")
if st.button("Save"):
pd = PodcastConfig(**pd_cfg)
pd_cfg = {}
pd.save()
st.success("Saved")
st.rerun()
for pd_config in PodcastConfig.get_all():
with st.expander(pd_config.name):
@ -72,6 +95,13 @@ with templates_tab:
value=pd_config.podcast_tagline,
key=f"podcast_tagline_{pd_config.id}",
)
pd_config.user_instructions = st.text_input(
"User Instructions",
value=pd_config.user_instructions,
help="Any additional intructions to pass to the LLM that will generate the transcript",
key=f"user_instructions_{pd_config.id}",
)
pd_config.output_language = st.text_input(
"Language",
value=pd_config.output_language,
@ -132,10 +162,16 @@ with templates_tab:
key=f"provider_{pd_config.id}",
)
pd_config.voice1 = st.text_input(
"Voice 1", value=pd_config.voice1, key=f"voice1_{pd_config.id}"
"Voice 1",
value=pd_config.voice1,
key=f"voice1_{pd_config.id}",
help="You can use Elevenlabs voice ID",
)
pd_config.voice2 = st.text_input(
"Voice 2", value=pd_config.voice2, key=f"voice2_{pd_config.id}"
"Voice 2",
value=pd_config.voice2,
key=f"voice2_{pd_config.id}",
help="You can use Elevenlabs voice ID",
)
pd_config.model = st.text_input(
"Model", value=pd_config.model, key=f"model_{pd_config.id}"
@ -145,6 +181,12 @@ with templates_tab:
pd_config.save()
st.rerun()
if st.button("Duplicate Config", key=f"btn_duplicate{pd_config.id}"):
pd_config.name = f"{pd_config.name} - Copy"
pd_config.id = None
pd_config.save()
st.rerun()
if st.button("Delete Config", key=f"btn_delete{pd_config.id}"):
pd_config.delete()
st.rerun()

1180
poetry.lock generated

File diff suppressed because it is too large Load diff

View file

@ -1,6 +1,6 @@
[tool.poetry]
name = "open-notebook"
version = "0.0.3"
version = "0.0.4"
description = "An open source implementation of a research assistant, inspired by Google Notebook LM"
authors = ["Luis Novo <lfnovo@gmail.com>"]
license = "MIT"
@ -39,6 +39,7 @@ langchain-anthropic = "^0.2.3"
langchain-ollama = "^0.2.0"
langchain-google-vertexai = "^2.0.5"
sdblpy = "^0.3.0"
podcastfy = "^0.2.8"
[tool.poetry.group.dev.dependencies]
ipykernel = "^6.29.5"

View file

@ -3,7 +3,7 @@ from langchain_core.runnables import RunnableConfig
from open_notebook.domain import Note, Source
from open_notebook.graphs.chat import graph as chat_graph
from open_notebook.plugins.podcasts import PodcastConfig, PodcastEpisode
from open_notebook.plugins.podcasts import PodcastConfig
from open_notebook.utils import token_count
from stream_app.note import make_note_from_chat
@ -54,30 +54,38 @@ def execute_chat(txt_input, session_id):
return result
podcast_configs = PodcastConfig.get_all()
podcast_config_names = [pd.name for pd in podcast_configs]
# TODO: if the token count is going to be used, it needs to be configurable
# it would be good to have a token total somewhere in the admin
def chat_sidebar(session_id):
context = build_context(session_id=session_id)
tokens = token_count(str(context) + str(st.session_state[session_id]["messages"]))
chat_tab, podcast_tab = st.tabs(["Chat", "Podcast"])
with st.expander(f"Context ({tokens} tokens), {len(str(context))} chars"):
st.json(context)
with podcast_tab:
with st.container(border=True):
template = st.selectbox("Pick a template", podcast_config_names)
episode_name = st.text_input("Episode Name")
instructions = st.text_area("Instructions")
if st.button("Generate"):
epi = PodcastEpisode(
name=episode_name,
instructions=instructions,
template=template,
file_path="lallaa",
podcast_configs = PodcastConfig.get_all()
podcast_config_names = [pd.name for pd in podcast_configs]
if len(podcast_configs) == 0:
st.warning("No podcast configurations found")
else:
template = st.selectbox("Pick a template", podcast_config_names)
selected_template = next(
filter(lambda x: x.name == template, podcast_configs)
)
epi.save()
st.page_link("pages/5_🎙_Podcasts.py", label="Go to Config")
episode_name = st.text_input("Episode Name")
instructions = st.text_area(
"Instructions", value=selected_template.user_instructions
)
if st.button("Generate"):
with st.spinner("Go grab a coffee, almost here..."):
selected_template.generate_episode(
episode_name=episode_name,
text=context,
instructions=instructions,
)
st.success("Episode generated successfully")
st.page_link("pages/5_🎙_Podcasts.py", label="Go to Podcasts")
st.divider()
with chat_tab:
with st.container(border=True):
@ -94,12 +102,12 @@ def chat_sidebar(session_id):
if not msg.content:
continue
with st.chat_message(name=msg.type):
st.write(msg.content)
if msg.type == "ai":
if st.button("💾 New Note", key=f"render_save_{msg.id}"):
make_note_from_chat(
content=msg.content,
notebook_id=st.session_state[session_id]["notebook"].id,
)
st.rerun()
with st.chat_message(name=msg.type):
st.write(msg.content)
if msg.type == "ai":
if st.button("💾 New Note", key=f"render_save_{msg.id}"):
make_note_from_chat(
content=msg.content,
notebook_id=st.session_state[session_id]["notebook"].id,
)
st.rerun()