feat: add Gemini image generation support

This commit is contained in:
ystyleb 2026-04-01 13:05:13 +08:00
parent 8db2c1994a
commit 6e0ff85f30
5 changed files with 115 additions and 18 deletions

View file

@ -192,7 +192,7 @@ wewrite/
│ ├── theme.py # YAML 主题引擎
│ ├── publisher.py # 微信草稿箱 API + 小绿书图片帖
│ ├── wechat_api.py # access_token / 图片上传
│ ├── image_gen.py # AI 图片生成doubao / OpenAI
│ ├── image_gen.py # AI 图片生成doubao / OpenAI / Gemini
│ └── themes/ # 16+ 排版主题(含暗黑模式,可从文章学习新增)
├── personas/ # 5 套写作人格预设(含朱雀实测数据)

View file

@ -9,7 +9,7 @@ wechat:
# AI 图片生成
image:
# 可选 provider: doubao | openai
# 可选 provider: doubao | openai | gemini
provider: "doubao"
api_key: "your_api_key"
@ -24,5 +24,11 @@ image:
# model: "dall-e-3"
# base_url: "https://api.openai.com/v1"
# Google Gemini (native image generation via generateContent)
# provider: "gemini"
# api_key: "AIza..."
# 获取 API key: https://aistudio.google.com/apikey
# model: "gemini-3.1-flash-image-preview"
# 默认排版主题
theme: "professional-clean"

View file

@ -9,7 +9,7 @@ wechat:
# AI 图片生成
image:
# 可选 provider: doubao | openai
# 可选 provider: doubao | openai | gemini
provider: "doubao"
api_key: "your_api_key"
@ -24,5 +24,11 @@ image:
# model: "dall-e-3"
# base_url: "https://api.openai.com/v1"
# Google Gemini (native image generation via generateContent)
# provider: "gemini"
# api_key: "AIza..."
# 获取 API key: https://aistudio.google.com/apikey
# model: "gemini-3.1-flash-image-preview"
# 默认排版主题
theme: "professional-clean"

View file

@ -5,12 +5,13 @@ AI image generation module for WeWrite.
Supports multiple providers via a simple abstraction:
- doubao-seedream (Volcengine Ark) default, good for Chinese prompts
- openai (DALL-E 3) broad availability
- gemini (Google Gemini native image models) multimodal image generation
- Custom providers via ImageProvider base class
Usage as CLI:
python3 image_gen.py --prompt "描述" --output cover.png
python3 image_gen.py --prompt "描述" --output cover.png --size cover
python3 image_gen.py --prompt "描述" --output cover.png --provider openai
python3 image_gen.py --prompt "描述" --output cover.png --provider gemini
Usage as module:
from image_gen import generate_image
@ -19,6 +20,7 @@ Usage as module:
import abc
import argparse
import base64
import json
import sys
from pathlib import Path
@ -50,10 +52,10 @@ def _load_config() -> dict:
# Article: 16:9 横版内文配图
# Vertical: 9:16 竖版
# Preset name -> provider key -> size string in "WIDTHxHEIGHT" form.
# Each preset is defined exactly once; a repeated key in a dict literal is
# silently overwritten by the later entry, so duplicates are dead code.
SIZE_PRESETS = {
    "cover": {"doubao": "2952x1256", "openai": "1792x1024", "gemini": "1792x1024"},
    "article": {"doubao": "2560x1440", "openai": "1792x1024", "gemini": "1792x1024"},
    "vertical": {"doubao": "1088x2560", "openai": "1024x1792", "gemini": "1024x1792"},
    "square": {"doubao": "2048x2048", "openai": "1024x1024", "gemini": "1024x1024"},
}
MAX_FILE_SIZE = 5 * 1024 * 1024 # 5MB
@ -211,14 +213,54 @@ class OpenAIProvider(ImageProvider):
return img_resp.content
class GeminiProvider(ImageProvider):
    """Image provider backed by the Gemini ``generateContent`` REST endpoint.

    The prompt is sent as a single text part with both TEXT and IMAGE response
    modalities requested; the first inline image found in the first candidate
    is returned as raw bytes.
    """

    provider_key = "gemini"

    def __init__(self, api_key: str, model: str = "gemini-3.1-flash-image-preview",
                 base_url: str = "https://generativelanguage.googleapis.com/v1beta"):
        """Store the credentials and endpoint used by :meth:`generate`.

        Args:
            api_key: Gemini API key.
            model: Model name inserted into the request URL.
            base_url: API root that ``/models/<model>:generateContent`` is
                appended to.
        """
        self._api_key = api_key
        self._model = model
        self._base_url = base_url

    def generate(self, prompt: str, size: str) -> bytes:
        """Generate one image for *prompt* and return its decoded bytes.

        Args:
            prompt: Text prompt sent as the only content part.
            size: Accepted for interface compatibility; this provider does not
                send it to the API, so the model chooses the output dimensions.

        Returns:
            The base64-decoded payload of the first ``image/*`` inline part.

        Raises:
            ValueError: On a non-200 response, a non-JSON response body, a
                response without candidates, or a response without an image.
        """
        body = {
            "contents": [{"parts": [{"text": prompt}]}],
            "generationConfig": {"responseModalities": ["TEXT", "IMAGE"]},
        }
        headers = {
            "Content-Type": "application/json",
            # Pass the key as a header instead of a ?key= query parameter so it
            # cannot leak through URLs in logs, proxies or exception messages.
            "x-goog-api-key": self._api_key,
        }
        url = f"{self._base_url}/models/{self._model}:generateContent"

        # The context manager releases the connection pool once the call is done.
        with requests.Session() as session:
            # Do not pick up proxy / netrc settings from the environment.
            session.trust_env = False
            resp = session.post(url, headers=headers, json=body, timeout=120)

        try:
            data = resp.json()
        except ValueError:
            # Gateways may answer with HTML or plain text; surface the status
            # and a body excerpt instead of an opaque JSON decode error.
            raise ValueError(
                f"Gemini API returned a non-JSON response "
                f"({resp.status_code}): {resp.text[:500]}"
            ) from None
        if not isinstance(data, dict):
            raise ValueError(
                f"Unexpected Gemini response ({resp.status_code}): "
                f"{json.dumps(data, ensure_ascii=False)[:500]}"
            )

        if resp.status_code != 200:
            error = data.get("error")
            msg = error.get("message") if isinstance(error, dict) else None
            if not msg:
                msg = json.dumps(data, ensure_ascii=False)
            raise ValueError(f"Gemini API error ({resp.status_code}): {msg}")

        candidates = data.get("candidates") or []
        if not candidates:
            raise ValueError("No candidates in Gemini response")

        # "content" / "parts" can be absent or null (e.g. a blocked candidate);
        # treat both as "no parts" rather than crashing on None.
        content = candidates[0].get("content") or {}
        for part in content.get("parts") or []:
            inline_data = part.get("inlineData") or {}
            mime_type = inline_data.get("mimeType") or ""
            encoded = inline_data.get("data")
            if encoded and mime_type.startswith("image/"):
                return base64.b64decode(encoded)

        raise ValueError("No image found in Gemini response parts")
# --- Provider registry ---
# Maps a provider name to the class that implements it. The keys match the
# provider names listed for the --provider CLI option (doubao, openai, gemini).
# NOTE(review): presumably looked up with the configured provider name when a
# provider instance is built; confirm against the caller.
PROVIDERS = {
"doubao": DoubaoProvider,
"openai": OpenAIProvider,
"gemini": GeminiProvider,
}
def _build_provider(config: dict) -> ImageProvider:
"""Build an ImageProvider from config.yaml's image section."""
img_cfg = config.get("image", {})
@ -287,7 +329,7 @@ def generate_image(
def main():
parser = argparse.ArgumentParser(
description="Generate images using AI (doubao-seedream, OpenAI DALL-E, etc.)"
description="Generate images using AI (doubao-seedream, OpenAI DALL-E, Gemini Imagen, etc.)"
)
parser.add_argument("--prompt", required=True, help="Image generation prompt")
parser.add_argument("--output", required=True, help="Output file path")
@ -299,7 +341,7 @@ def main():
parser.add_argument(
"--provider",
default=None,
help="Override provider (doubao, openai). Default: from config.yaml",
help="Override provider (doubao, openai, gemini). Default: from config.yaml",
)
args = parser.parse_args()

View file

@ -5,12 +5,13 @@ AI image generation module for WeWrite.
Supports multiple providers via a simple abstraction:
- doubao-seedream (Volcengine Ark) default, good for Chinese prompts
- openai (DALL-E 3) broad availability
- gemini (Google Gemini native image models) multimodal image generation
- Custom providers via ImageProvider base class
Usage as CLI:
python3 image_gen.py --prompt "描述" --output cover.png
python3 image_gen.py --prompt "描述" --output cover.png --size cover
python3 image_gen.py --prompt "描述" --output cover.png --provider openai
python3 image_gen.py --prompt "描述" --output cover.png --provider gemini
Usage as module:
from image_gen import generate_image
@ -19,6 +20,7 @@ Usage as module:
import abc
import argparse
import base64
import json
import sys
from pathlib import Path
@ -50,10 +52,10 @@ def _load_config() -> dict:
# Article: 16:9 横版内文配图
# Vertical: 9:16 竖版
# Preset name -> provider key -> size string in "WIDTHxHEIGHT" form.
# Each preset is defined exactly once; a repeated key in a dict literal is
# silently overwritten by the later entry, so duplicates are dead code.
SIZE_PRESETS = {
    "cover": {"doubao": "2952x1256", "openai": "1792x1024", "gemini": "1792x1024"},
    "article": {"doubao": "2560x1440", "openai": "1792x1024", "gemini": "1792x1024"},
    "vertical": {"doubao": "1088x2560", "openai": "1024x1792", "gemini": "1024x1792"},
    "square": {"doubao": "2048x2048", "openai": "1024x1024", "gemini": "1024x1024"},
}
MAX_FILE_SIZE = 5 * 1024 * 1024 # 5MB
@ -211,11 +213,52 @@ class OpenAIProvider(ImageProvider):
return img_resp.content
class GeminiProvider(ImageProvider):
    """Image provider backed by the Gemini ``generateContent`` REST endpoint.

    The prompt is sent as a single text part with both TEXT and IMAGE response
    modalities requested; the first inline image found in the first candidate
    is returned as raw bytes.
    """

    provider_key = "gemini"

    def __init__(self, api_key: str, model: str = "gemini-3.1-flash-image-preview",
                 base_url: str = "https://generativelanguage.googleapis.com/v1beta"):
        """Store the credentials and endpoint used by :meth:`generate`.

        Args:
            api_key: Gemini API key.
            model: Model name inserted into the request URL.
            base_url: API root that ``/models/<model>:generateContent`` is
                appended to.
        """
        self._api_key = api_key
        self._model = model
        self._base_url = base_url

    def generate(self, prompt: str, size: str) -> bytes:
        """Generate one image for *prompt* and return its decoded bytes.

        Args:
            prompt: Text prompt sent as the only content part.
            size: Accepted for interface compatibility; this provider does not
                send it to the API, so the model chooses the output dimensions.

        Returns:
            The base64-decoded payload of the first ``image/*`` inline part.

        Raises:
            ValueError: On a non-200 response, a non-JSON response body, a
                response without candidates, or a response without an image.
        """
        body = {
            "contents": [{"parts": [{"text": prompt}]}],
            "generationConfig": {"responseModalities": ["TEXT", "IMAGE"]},
        }
        headers = {
            "Content-Type": "application/json",
            # Pass the key as a header instead of a ?key= query parameter so it
            # cannot leak through URLs in logs, proxies or exception messages.
            "x-goog-api-key": self._api_key,
        }
        url = f"{self._base_url}/models/{self._model}:generateContent"

        # The context manager releases the connection pool once the call is done.
        with requests.Session() as session:
            # Do not pick up proxy / netrc settings from the environment.
            session.trust_env = False
            resp = session.post(url, headers=headers, json=body, timeout=120)

        try:
            data = resp.json()
        except ValueError:
            # Gateways may answer with HTML or plain text; surface the status
            # and a body excerpt instead of an opaque JSON decode error.
            raise ValueError(
                f"Gemini API returned a non-JSON response "
                f"({resp.status_code}): {resp.text[:500]}"
            ) from None
        if not isinstance(data, dict):
            raise ValueError(
                f"Unexpected Gemini response ({resp.status_code}): "
                f"{json.dumps(data, ensure_ascii=False)[:500]}"
            )

        if resp.status_code != 200:
            error = data.get("error")
            msg = error.get("message") if isinstance(error, dict) else None
            if not msg:
                msg = json.dumps(data, ensure_ascii=False)
            raise ValueError(f"Gemini API error ({resp.status_code}): {msg}")

        candidates = data.get("candidates") or []
        if not candidates:
            raise ValueError("No candidates in Gemini response")

        # "content" / "parts" can be absent or null (e.g. a blocked candidate);
        # treat both as "no parts" rather than crashing on None.
        content = candidates[0].get("content") or {}
        for part in content.get("parts") or []:
            inline_data = part.get("inlineData") or {}
            mime_type = inline_data.get("mimeType") or ""
            encoded = inline_data.get("data")
            if encoded and mime_type.startswith("image/"):
                return base64.b64decode(encoded)

        raise ValueError("No image found in Gemini response parts")
# --- Provider registry ---
# Maps a provider name to the class that implements it. The keys match the
# provider names listed for the --provider CLI option (doubao, openai, gemini).
# NOTE(review): presumably looked up with the configured provider name when a
# provider instance is built; confirm against the caller.
PROVIDERS = {
"doubao": DoubaoProvider,
"openai": OpenAIProvider,
"gemini": GeminiProvider,
}
@ -287,7 +330,7 @@ def generate_image(
def main():
parser = argparse.ArgumentParser(
description="Generate images using AI (doubao-seedream, OpenAI DALL-E, etc.)"
description="Generate images using AI (doubao-seedream, OpenAI DALL-E, Gemini Imagen, etc.)"
)
parser.add_argument("--prompt", required=True, help="Image generation prompt")
parser.add_argument("--output", required=True, help="Output file path")
@ -299,7 +342,7 @@ def main():
parser.add_argument(
"--provider",
default=None,
help="Override provider (doubao, openai). Default: from config.yaml",
help="Override provider (doubao, openai, gemini). Default: from config.yaml",
)
args = parser.parse_args()