Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ license = { file = "LICENSE" }

dependencies = [
"fastapi>=0.109.0",
"python-multipart>=0.0.9",
"uvicorn>=0.27.0",
"pydantic>=2.6.0",
"pydantic-settings>=2.1.0",
Expand Down
45 changes: 35 additions & 10 deletions server.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,11 +192,13 @@ async def scrape_chat_link(req: ScrapeRequest):
elapsed = round((time.perf_counter() - start) * 1000, 2)

if not pairs:
from src.api.chat_share import scrape_failure_message

return JSONResponse(
{
"status": "error",
"data": None,
"error": "Failed to extract messages from the provided link.",
"error": scrape_failure_message(result),
"elapsed_ms": elapsed,
},
status_code=400,
Expand Down Expand Up @@ -554,7 +556,7 @@ def _render_chat_share_sync(url: str) -> tuple[str, str]:
with sync_playwright() as p:
browser = None
launch_errors = []
for channel in (None, "msedge", "chrome"):
for channel in ("chromium", None, "msedge", "chrome"):
try:
kwargs = {"headless": True}
if channel:
Expand Down Expand Up @@ -591,23 +593,23 @@ def _block_heavy_assets(route):
page.route("**/*", _block_heavy_assets)

try:
page.goto(url, wait_until="networkidle", timeout=20000)
page.goto(url, wait_until="domcontentloaded", timeout=30000)
except Exception as exc:
print(f"[scrape] navigation warning: {exc}", flush=True)

provider = _detect_chat_provider(page.url or url)
selector = {
"chatgpt": "div[data-message-author-role]",
"claude": "script",
"gemini": "message-content, div.user-query, div.model-response",
"claude": "div[data-testid='user-message'], div.font-claude-response",
"gemini": "message-content, div.user-query, div.model-response, .query-text",
}.get(provider)
if selector:
try:
page.wait_for_selector(selector, timeout=12000)
except Exception as exc:
print(f"[scrape] timed out waiting for {provider} content: {exc}", flush=True)

page.wait_for_timeout(2000)
page.wait_for_timeout(5000 if provider == "claude" else 2000)
final_url = page.url
html = page.content()
finally:
Expand Down Expand Up @@ -660,14 +662,37 @@ def _extract_chat_pairs(url: str, html: str) -> tuple[str, str, list[dict[str, s
extraction_method = "structured"
except Exception as exc:
print(f"[scrape] Claude parse warning: {exc}", flush=True)
if not pairs:
user_msgs = soup.select("div[data-testid='user-message']")
asst_msgs = soup.select("div.font-claude-response")
for user_msg, assistant_msg in zip(user_msgs, asst_msgs):
pairs.append({
"user_query": user_msg.get_text(separator="\n", strip=True),
"agent_response": assistant_msg.get_text(separator="\n", strip=True),
})
if pairs:
extraction_method = "dom"

elif provider == "gemini":
user_blocks = soup.select("message-content[role='user'], div.user-query")
model_blocks = soup.select("message-content[role='model'], div.model-response")
user_blocks = soup.select(
"message-content[role='user'], div.user-query, .query-text"
)
model_blocks = soup.select(
"message-content[role='model'], div.model-response, "
"structured-content-container.message-content message-content, "
"message-content:not([role])"
)
for user_block, model_block in zip(user_blocks, model_blocks):
user_text = user_block.get_text(separator="\n", strip=True)
user_labels = {"you said", "your prompt", "あなたの入力", "あなたのプロンプト"}
user_lines = [
line.strip()
for line in user_text.splitlines()
if line.strip() and line.strip().lower() not in user_labels
]
pairs.append({
"user_query": user_block.get_text(separator="\n").strip(),
"agent_response": model_block.get_text(separator="\n").strip(),
"user_query": "\n".join(user_lines),
"agent_response": model_block.get_text(separator="\n", strip=True),
})
if pairs:
extraction_method = "dom"
Expand Down
23 changes: 23 additions & 0 deletions src/api/chat_share.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from __future__ import annotations

from typing import Any


def scrape_failure_message(result: dict[str, Any]) -> str:
    """Build the user-facing error for a chat-share scrape that yielded no pairs.

    Args:
        result: Scrape result dict; only the ``"provider"`` key is consulted
            (expected values: ``"chatgpt"``, ``"claude"``, ``"gemini"`` or
            missing/falsy for unrecognized links).

    Returns:
        A provider-specific hint when the provider is recognized, otherwise a
        generic message listing the supported share-link providers.
    """
    display_names = {
        "chatgpt": "ChatGPT",
        "claude": "Claude",
        "gemini": "Gemini",
    }
    provider = result.get("provider") or "unknown"
    display_name = display_names.get(provider)

    # Unrecognized provider: point the user at what is actually supported.
    if display_name is None:
        return (
            "Failed to extract messages from the provided link. "
            "Supported public share links are ChatGPT, Claude, and Gemini."
        )

    return (
        f"Could not extract messages from this {display_name} share link. "
        "Make sure the link is public, still exists, and has not expired."
    )
45 changes: 35 additions & 10 deletions src/api/routes/memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
require_api_key,
require_ready,
)
from src.api.chat_share import scrape_failure_message
from src.api.schemas import (
APIResponse,
BatchIngestRequest,
Expand Down Expand Up @@ -154,7 +155,7 @@ def _get_or_create_browser():
_pw_instance = sync_playwright().start()

launch_errors = []
for channel in (None, "msedge", "chrome"):
for channel in ("chromium", None, "msedge", "chrome"):
try:
kwargs = {"headless": True}
if channel:
Expand Down Expand Up @@ -209,17 +210,17 @@ def _block_heavy_assets(route):
provider = _detect_chat_provider(page.url or url)
selector = {
"chatgpt": "div[data-message-author-role]",
"claude": "script",
"gemini": "message-content, div.user-query, div.model-response",
"claude": "div[data-testid='user-message'], div.font-claude-response",
"gemini": "message-content, div.user-query, div.model-response, .query-text",
}.get(provider)
if selector:
try:
page.wait_for_selector(selector, timeout=8000)
except Exception as exc:
logger.warning("Timed out waiting for %s content: %s", provider, exc)

# No hardcoded sleep — the selector wait above already guarantees
# the chat content DOM nodes are present.
if provider == "claude":
page.wait_for_timeout(5000)

final_url = page.url
html = page.content()
Expand Down Expand Up @@ -273,14 +274,37 @@ def _extract_chat_pairs(url: str, html: str) -> tuple[str, str, List[MessagePair
extraction_method = "structured"
except Exception as exc:
logger.warning("Failed to parse Claude preloaded state: %s", exc)
if not pairs:
user_msgs = soup.select("div[data-testid='user-message']")
asst_msgs = soup.select("div.font-claude-response")
for u, a in zip(user_msgs, asst_msgs):
pairs.append(MessagePair(
user_query=u.get_text(separator="\n", strip=True),
agent_response=a.get_text(separator="\n", strip=True),
))
if pairs:
extraction_method = "dom"

elif provider == "gemini":
user_blocks = soup.select("message-content[role='user'], div.user-query")
model_blocks = soup.select("message-content[role='model'], div.model-response")
user_blocks = soup.select(
"message-content[role='user'], div.user-query, .query-text"
)
model_blocks = soup.select(
"message-content[role='model'], div.model-response, "
"structured-content-container.message-content message-content, "
"message-content:not([role])"
)
for u, m in zip(user_blocks, model_blocks):
user_text = u.get_text(separator="\n", strip=True)
user_labels = {"you said", "your prompt", "あなたの入力", "あなたのプロンプト"}
user_lines = [
line.strip()
for line in user_text.splitlines()
if line.strip() and line.strip().lower() not in user_labels
]
pairs.append(MessagePair(
user_query=u.get_text(separator="\n").strip(),
agent_response=m.get_text(separator="\n").strip(),
user_query="\n".join(user_lines),
agent_response=m.get_text(separator="\n", strip=True),
))
if pairs:
extraction_method = "dom"
Expand Down Expand Up @@ -757,7 +781,8 @@ async def scrape_chat_link(req: ScrapeRequest, request: Request):
pairs = result["pairs"]

if not pairs:
return _error(request, "Failed to extract messages from the provided link.", 400)
elapsed = round((time.perf_counter() - start) * 1000, 2)
return _error(request, scrape_failure_message(result), 400, elapsed)

data = ScrapeResponse(pairs=pairs)
elapsed = round((time.perf_counter() - start) * 1000, 2)
Comment on lines 783 to 788
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The error response for missing message pairs is missing the elapsed_ms timing information, which results in a default value of 0.0. This is inconsistent with the success response and the implementation in server.py. Calculating the elapsed time before the check ensures that the user receives accurate timing even when extraction fails.

Suggested change
if not pairs:
return _error(request, "Failed to extract messages from the provided link.", 400)
return _error(request, scrape_failure_message(result), 400)
data = ScrapeResponse(pairs=pairs)
elapsed = round((time.perf_counter() - start) * 1000, 2)
elapsed = round((time.perf_counter() - start) * 1000, 2)
if not pairs:
return _error(request, scrape_failure_message(result), 400, elapsed)
data = ScrapeResponse(pairs=pairs)

Expand Down
182 changes: 182 additions & 0 deletions tests/test_chat_share_extraction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
import json
import os
from types import SimpleNamespace

# Seed dummy credentials before the application imports below — presumably
# the src.api settings read these environment variables at import time, so
# they must exist first (NOTE(review): confirm against src.api settings).
os.environ.setdefault("PINECONE_API_KEY", "test-pinecone-key")
os.environ.setdefault("NEO4J_PASSWORD", "test-neo4j-password")
os.environ.setdefault("GEMINI_API_KEY", "test-gemini-key")

# Application imports are deliberately placed after the env setup above.
from src.api.chat_share import scrape_failure_message
from src.api.routes.memory import (
    _detect_chat_provider,
    _extract_chat_pairs,
    scrape_chat_link,
)
from src.api.schemas import MessagePair, ScrapeRequest


def test_detects_supported_chat_share_providers():
    """Each supported share-link domain maps to its provider key."""
    expectations = {
        "https://chatgpt.com/share/abc": "chatgpt",
        "https://chat.openai.com/share/abc": "chatgpt",
        "https://claude.ai/share/abc": "claude",
        "https://gemini.google.com/share/abc": "gemini",
        "https://g.co/gemini/share/abc": "gemini",
    }
    for url, provider in expectations.items():
        assert _detect_chat_provider(url) == provider


def test_extracts_chatgpt_dom_pairs():
    """ChatGPT author-role divs yield a single DOM-extracted message pair."""
    html = """
    <div data-message-author-role="user">What is XMem?</div>
    <div data-message-author-role="assistant">A long-term memory layer.</div>
    """

    provider, method, pairs = _extract_chat_pairs("https://chatgpt.com/share/abc", html)

    assert (provider, method) == ("chatgpt", "dom")
    expected = MessagePair(
        user_query="What is XMem?",
        agent_response="A long-term memory layer.",
    )
    assert pairs == [expected]


def test_extracts_claude_preloaded_state_pairs():
    """Claude's __PRELOADED_STATE__ script blob is parsed structurally."""
    messages = [
        {"sender": "human", "text": "Summarize this repo."},
        {"sender": "assistant", "text": "It stores memories for agents."},
    ]
    state = {"chat": {"messages": messages}}
    html = f"<script>window.__PRELOADED_STATE__ = {json.dumps(state)};</script>"

    provider, method, pairs = _extract_chat_pairs("https://claude.ai/share/abc", html)

    assert (provider, method) == ("claude", "structured")
    assert pairs == [
        MessagePair(
            user_query="Summarize this repo.",
            agent_response="It stores memories for agents.",
        )
    ]


def test_extracts_claude_current_public_share_dom_pairs():
    """The current Claude public-share markup falls back to DOM extraction."""
    html = """
    <div data-testid="user-message">
      <p class="whitespace-pre-wrap break-words">test test</p>
    </div>
    <div class="font-claude-response relative leading">
      <div>
        <div class="standard-markdown">
          <p class="font-claude-response-body">Hey! I'm here and working.</p>
        </div>
      </div>
    </div>
    """

    provider, method, pairs = _extract_chat_pairs("https://claude.ai/share/abc", html)

    assert (provider, method) == ("claude", "dom")
    assert len(pairs) == 1
    assert pairs[0] == MessagePair(
        user_query="test test",
        agent_response="Hey! I'm here and working.",
    )


def test_extracts_gemini_dom_pairs():
    """Gemini message-content elements with roles yield a DOM pair."""
    html = """
    <message-content role="user">Compare memory tools.</message-content>
    <message-content role="model">XMem focuses on persistent agent memory.</message-content>
    """
    url = "https://gemini.google.com/share/abc"

    provider, method, pairs = _extract_chat_pairs(url, html)

    assert (provider, method) == ("gemini", "dom")
    expected = MessagePair(
        user_query="Compare memory tools.",
        agent_response="XMem focuses on persistent agent memory.",
    )
    assert pairs == [expected]


def test_extracts_gemini_current_public_share_dom_pairs():
    """Current Gemini public-share markup is handled; the 'You said' label is dropped."""
    html = """
    <div class="query-text">
      <span class="screen-reader-user-query-label"> You said </span>
      <p class="query-text-line"> Test test </p>
    </div>
    <structured-content-container class="message-content">
      <message-content>
        <div class="markdown">
          <p>Loud and clear! I'm here and ready to roll.</p>
        </div>
      </message-content>
    </structured-content-container>
    """
    url = "https://gemini.google.com/share/abc"

    provider, method, pairs = _extract_chat_pairs(url, html)

    assert provider == "gemini"
    assert method == "dom"
    assert len(pairs) == 1
    assert pairs[0] == MessagePair(
        user_query="Test test",
        agent_response="Loud and clear! I'm here and ready to roll.",
    )


def test_scrape_failure_message_names_private_or_missing_provider_links():
    """A known-provider failure message names the provider and hints at causes."""
    message = scrape_failure_message({"provider": "claude"})

    for fragment in ("Claude share link", "public", "expired"):
        assert fragment in message


def test_scrape_failure_message_lists_supported_unknown_links():
    """An unknown-provider failure message enumerates all supported providers."""
    message = scrape_failure_message({"provider": "unknown"})

    assert "Supported public share links" in message
    for provider_name in ("ChatGPT", "Claude", "Gemini"):
        assert provider_name in message


def test_scrape_route_failure_uses_elapsed_ms(monkeypatch):
    """A scrape that yields no pairs returns 400 with real elapsed_ms timing.

    The route coroutine is driven with ``asyncio.run`` from a synchronous
    test so it executes under plain pytest; a bare ``async def test_*``
    is skipped (or only warned about) unless an async pytest plugin such
    as pytest-asyncio is installed and configured.
    """
    import asyncio

    async def fake_scrape(url: str):
        # Simulate a fetch that succeeded but produced no message pairs.
        return {"provider": "gemini", "pairs": []}

    # Two perf_counter readings 0.12345 s apart -> elapsed_ms == 123.45.
    ticks = iter([10.0, 10.12345])

    monkeypatch.setattr("src.api.routes.memory._scrape_chat_share", fake_scrape)
    monkeypatch.setattr("src.api.routes.memory.time.perf_counter", lambda: next(ticks))

    response = asyncio.run(
        scrape_chat_link(
            ScrapeRequest(url="https://gemini.google.com/share/abc"),
            SimpleNamespace(state=SimpleNamespace(request_id="req-test")),
        )
    )
    body = json.loads(response.body)

    assert response.status_code == 400
    assert body["elapsed_ms"] == 123.45
    assert "Gemini share link" in body["error"]
Loading