Hedge funds, quant traders, and fintech startups spend $5,000–$50,000/month on financial data feeds from Bloomberg, Refinitiv, and FactSet. But most of that data originates from publicly available sources — SEC filings, earnings call transcripts, company websites, and financial news portals.
With AI-powered web scraping, you can build automated financial intelligence systems that collect, structure, and analyze this data at a fraction of the cost. In this guide, we'll build a complete financial data pipeline using Python, the WebPerception API, and GPT-4o for AI-powered analysis.
Traditional financial data scraping is brittle. Stock tickers, earnings tables, and filing formats vary wildly across sources. CSS selectors break every quarter when sites redesign.
AI extraction solves this by understanding the content semantically — it reads financial tables, extracts key metrics, and structures data regardless of HTML layout changes.
Our system follows six steps:
Start by defining exactly what financial data you want to extract. Pydantic models ensure clean, validated output:
from pydantic import BaseModel
from typing import Optional, List
from datetime import date
class StockQuote(BaseModel):
    """Validated snapshot of one stock's quote scraped from a finance page."""

    ticker: str  # stock ticker symbol, e.g. "AAPL"
    price: float  # current/last traded price
    change_percent: float  # percent change for the trading day
    volume: int  # shares traded
    market_cap: Optional[str] = None  # kept as text, as displayed by the source
    pe_ratio: Optional[float] = None  # price/earnings ratio, when published
    fifty_two_week_high: Optional[float] = None  # 52-week high price
    fifty_two_week_low: Optional[float] = None  # 52-week low price
class EarningsReport(BaseModel):
    """Structured quarterly earnings data extracted from an earnings page or press release."""

    company: str
    ticker: str
    quarter: str  # e.g., "Q4 2025"
    revenue: float  # in millions
    revenue_growth_yoy: Optional[float] = None  # year-over-year revenue growth, percent
    eps: float  # reported earnings per share
    eps_estimate: Optional[float] = None  # analyst consensus EPS, when available
    eps_surprise: Optional[float] = None  # actual EPS minus estimate
    net_income: Optional[float] = None  # in millions
    guidance_revenue: Optional[str] = None  # forward revenue guidance, free text
    guidance_eps: Optional[str] = None  # forward EPS guidance, free text
    # Mutable default is safe on a Pydantic model: field defaults are copied per instance.
    key_highlights: List[str] = []
class SECFiling(BaseModel):
    """Metadata for a single SEC filing; the document itself lives at `url`."""

    company: str
    ticker: str
    filing_type: str  # 10-K, 10-Q, 8-K, etc.
    filed_date: str  # submission date, kept as text as scraped
    period_end: str  # end of the reporting period covered by the filing
    url: str  # link to the filing document
    description: Optional[str] = None  # optional free-text summary
class MarketNews(BaseModel):
    """One market news item plus a lightweight sentiment classification."""

    headline: str
    source: str  # publisher name
    published: str  # publication timestamp/date, kept as text as scraped
    url: str
    # Mutable default is safe on a Pydantic model: field defaults are copied per instance.
    tickers_mentioned: List[str] = []
    sentiment: str  # bullish, bearish, neutral
    summary: str
Use the WebPerception API to scrape and extract structured financial data from any source:
import httpx
import json
from typing import List
# Replace with your real key; prefer loading it from an environment variable
# over hard-coding it in source.
MANTIS_API_KEY = "your-api-key"
# Base endpoint of the extraction API used by all scraping helpers below.
BASE_URL = "https://api.mantisapi.com/v1"
async def extract_stock_quotes(url: str) -> List[dict]:
    """Extract stock quote data from any financial page.

    Args:
        url: The page to scrape (e.g. a Yahoo Finance quote page).

    Returns:
        A list of per-stock dicts following the schema below; an empty list
        when the API response carries no "stocks" payload.

    Raises:
        httpx.HTTPStatusError: if the extraction API answers with a 4xx/5xx.
    """
    async with httpx.AsyncClient(timeout=30) as client:
        response = await client.post(
            f"{BASE_URL}/extract",
            headers={"Authorization": f"Bearer {MANTIS_API_KEY}"},
            json={
                "url": url,
                # Field descriptions guide the AI extractor to the right values.
                "schema": {
                    "stocks": [{
                        "ticker": "string - stock ticker symbol",
                        "price": "number - current stock price",
                        "change_percent": "number - percent change today",
                        "volume": "number - trading volume",
                        "market_cap": "string - market capitalization",
                        "pe_ratio": "number - P/E ratio if available"
                    }]
                }
            }
        )
        # Fail fast on auth/quota/server errors instead of silently
        # mis-parsing an error body as "no stocks found".
        response.raise_for_status()
        data = response.json()
        return data.get("extracted", {}).get("stocks", [])
async def extract_earnings(url: str) -> dict:
    """Extract earnings report data from an earnings page or press release.

    Args:
        url: The earnings page or press-release URL to scrape.

    Returns:
        A dict with the keys requested in the schema below; empty dict when
        the API returns no "extracted" payload.

    Raises:
        httpx.HTTPStatusError: if the extraction API answers with a 4xx/5xx.
    """
    async with httpx.AsyncClient(timeout=30) as client:
        response = await client.post(
            f"{BASE_URL}/extract",
            headers={"Authorization": f"Bearer {MANTIS_API_KEY}"},
            json={
                "url": url,
                "schema": {
                    "company": "string",
                    "ticker": "string",
                    "quarter": "string - fiscal quarter e.g. Q4 2025",
                    "revenue_millions": "number - total revenue in millions USD",
                    "revenue_growth_yoy": "number - year-over-year revenue growth %",
                    "eps": "number - earnings per share",
                    "eps_estimate": "number - analyst consensus EPS estimate",
                    "net_income_millions": "number - net income in millions",
                    "guidance_revenue": "string - forward revenue guidance",
                    "guidance_eps": "string - forward EPS guidance",
                    "key_highlights": ["string - notable items from the report"]
                }
            }
        )
        # Fail fast on auth/quota/server errors instead of silently
        # mis-parsing an error body as "no earnings found".
        response.raise_for_status()
        return response.json().get("extracted", {})
import sqlite3
from datetime import datetime, timezone
def init_db():
    """Open (creating if needed) financial_data.db and ensure all tables exist.

    Returns:
        The open sqlite3 connection; the caller is responsible for closing it.
    """
    # All schema statements in one place; CREATE TABLE IF NOT EXISTS makes
    # repeated calls idempotent.
    ddl_statements = (
        """CREATE TABLE IF NOT EXISTS stock_prices (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            ticker TEXT NOT NULL,
            price REAL NOT NULL,
            change_percent REAL,
            volume INTEGER,
            market_cap TEXT,
            pe_ratio REAL,
            scraped_at TEXT NOT NULL,
            source_url TEXT
        )""",
        """CREATE TABLE IF NOT EXISTS earnings (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            ticker TEXT NOT NULL,
            quarter TEXT NOT NULL,
            revenue_millions REAL,
            revenue_growth_yoy REAL,
            eps REAL,
            eps_estimate REAL,
            eps_surprise REAL,
            net_income_millions REAL,
            guidance_revenue TEXT,
            guidance_eps TEXT,
            key_highlights TEXT,
            scraped_at TEXT NOT NULL,
            source_url TEXT,
            UNIQUE(ticker, quarter)
        )""",
        """CREATE TABLE IF NOT EXISTS market_signals (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            ticker TEXT NOT NULL,
            signal_type TEXT NOT NULL,
            severity TEXT NOT NULL,
            description TEXT NOT NULL,
            ai_analysis TEXT,
            created_at TEXT NOT NULL
        )""",
    )
    connection = sqlite3.connect("financial_data.db")
    cursor = connection.cursor()
    for statement in ddl_statements:
        cursor.execute(statement)
    connection.commit()
    return connection
def store_stock_price(conn, quote: dict, source_url: str):
    """Persist one scraped quote row into stock_prices.

    Args:
        conn: open sqlite3 connection with the stock_prices table created.
        quote: extracted quote dict; "ticker" and "price" are required,
            the remaining fields are optional and stored as NULL if absent.
        source_url: URL the quote was scraped from (provenance).
    """
    conn.execute(
        """INSERT INTO stock_prices
        (ticker, price, change_percent, volume, market_cap, pe_ratio, scraped_at, source_url)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
        (quote["ticker"], quote["price"], quote.get("change_percent"),
         quote.get("volume"), quote.get("market_cap"), quote.get("pe_ratio"),
         # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated
         # since Python 3.12 and produces naive datetimes.
         datetime.now(timezone.utc).isoformat(), source_url)
    )
    conn.commit()
def store_earnings(conn, earnings: dict, source_url: str):
    """Upsert one earnings report into the earnings table.

    Rows are unique per (ticker, quarter); INSERT OR REPLACE refreshes a
    quarter that was already stored.

    Args:
        conn: open sqlite3 connection with the earnings table created.
        earnings: extracted earnings dict; "ticker" and "quarter" are
            required, everything else is optional (stored as NULL if absent —
            a missing EPS is NULL, not a misleading 0).
        source_url: URL the report was scraped from (provenance).
    """
    eps = earnings.get("eps")
    eps_est = earnings.get("eps_estimate")
    # Compute the surprise only when both figures exist. Checking
    # `is not None` (rather than truthiness) keeps a legitimate 0.0
    # estimate from silently skipping the calculation.
    surprise = round(eps - eps_est, 4) if eps is not None and eps_est is not None else None
    conn.execute(
        """INSERT OR REPLACE INTO earnings
        (ticker, quarter, revenue_millions, revenue_growth_yoy, eps, eps_estimate,
        eps_surprise, net_income_millions, guidance_revenue, guidance_eps,
        key_highlights, scraped_at, source_url)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
        (earnings["ticker"], earnings["quarter"], earnings.get("revenue_millions"),
         earnings.get("revenue_growth_yoy"), eps, eps_est, surprise,
         earnings.get("net_income_millions"), earnings.get("guidance_revenue"),
         # Highlights are serialized to JSON so a list survives the TEXT column.
         earnings.get("guidance_eps"), json.dumps(earnings.get("key_highlights", [])),
         # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated since 3.12.
         datetime.now(timezone.utc).isoformat(), source_url)
    )
    conn.commit()
The real value is automated detection of meaningful changes — earnings surprises, guidance revisions, unusual volume:
def detect_earnings_signals(conn, earnings: dict) -> List[dict]:
    """Detect significant signals from earnings data.

    Produces:
      * earnings_surprise — actual EPS deviates >10% from consensus
        (critical beyond 20%).
      * growth_trend — YoY revenue growth moved more than 5 points versus
        the previous quarter on record.

    Args:
        conn: sqlite3 connection with the earnings table populated.
        earnings: the freshly extracted report being evaluated.

    Returns:
        A (possibly empty) list of signal dicts with ticker, signal_type,
        severity and description keys.
    """
    signals = []
    ticker = earnings["ticker"]
    quarter = earnings.get("quarter")

    # Earnings surprise detection.
    eps = earnings.get("eps")
    eps_est = earnings.get("eps_estimate")
    # Require BOTH figures: a missing actual EPS must not fabricate a
    # "-100% miss" by defaulting to 0. Truthy eps_est also rules out
    # division by zero.
    if eps is not None and eps_est:
        surprise_pct = ((eps - eps_est) / abs(eps_est)) * 100
        if abs(surprise_pct) > 10:
            signals.append({
                "ticker": ticker,
                "signal_type": "earnings_surprise",
                "severity": "critical" if abs(surprise_pct) > 20 else "important",
                "description": f"{'Beat' if surprise_pct > 0 else 'Missed'} EPS estimate by {abs(surprise_pct):.1f}% "
                               f"(actual: ${eps:.2f} vs est: ${eps_est:.2f})"
            })

    # Revenue growth acceleration/deceleration.
    growth = earnings.get("revenue_growth_yoy")
    if growth is not None:
        # Exclude the current quarter from the lookup: store_earnings() has
        # typically already upserted this report, so an unfiltered "latest
        # row" would be the report itself and the delta would always be 0.
        prev = conn.execute(
            "SELECT revenue_growth_yoy FROM earnings WHERE ticker = ? AND quarter != ? ORDER BY id DESC LIMIT 1",
            (ticker, quarter)
        ).fetchone()
        if prev and prev[0] is not None:
            delta = growth - prev[0]
            if abs(delta) > 5:
                direction = "accelerating" if delta > 0 else "decelerating"
                signals.append({
                    "ticker": ticker,
                    "signal_type": "growth_trend",
                    "severity": "important",
                    "description": f"Revenue growth {direction}: {growth:.1f}% vs {prev[0]:.1f}% prior quarter"
                })
    return signals
def detect_price_signals(conn, ticker: str) -> List[dict]:
    """Detect unusual price movements from stored history.

    Examines the 10 most recent stock_prices rows for `ticker` and emits:
      * price_movement — latest daily change beyond +/-5% (critical beyond 10%).
      * unusual_volume — latest volume more than 2x the prior-sample average.

    Args:
        conn: sqlite3 connection with the stock_prices table populated.
        ticker: symbol to analyze.

    Returns:
        A (possibly empty) list of signal dicts with ticker, signal_type,
        severity and description keys.
    """
    signals = []
    rows = conn.execute(
        "SELECT price, change_percent, volume FROM stock_prices WHERE ticker = ? ORDER BY scraped_at DESC LIMIT 10",
        (ticker,)
    ).fetchall()
    if len(rows) < 2:
        return signals  # not enough history to compare against
    latest_price, latest_change, latest_vol = rows[0]

    # Big daily move.
    if latest_change and abs(latest_change) > 5:
        signals.append({
            "ticker": ticker,
            "signal_type": "price_movement",
            "severity": "critical" if abs(latest_change) > 10 else "important",
            "description": f"{'Up' if latest_change > 0 else 'Down'} {abs(latest_change):.1f}% today (${latest_price:.2f})"
        })

    # Unusual volume (2x the prior-sample average).
    if latest_vol and len(rows) >= 5:
        # Average only over samples that actually carry a volume value;
        # dividing by the raw row count would understate the average whenever
        # a sample's volume is NULL/0 and trigger false positives.
        prior_vols = [r[2] for r in rows[1:6] if r[2]]
        if prior_vols:
            avg_vol = sum(prior_vols) / len(prior_vols)
            if latest_vol > avg_vol * 2:
                signals.append({
                    "ticker": ticker,
                    "signal_type": "unusual_volume",
                    "severity": "important",
                    "description": f"Volume {latest_vol:,} is {latest_vol/avg_vol:.1f}x the 5-day average"
                })
    return signals
This is where AI truly shines — interpreting what the data means, not just what changed:
from openai import OpenAI

# Module-level OpenAI client shared by all analysis helpers below.
client = OpenAI()
def analyze_earnings_with_ai(earnings: dict, signals: list, historical: list) -> str:
    """Use GPT-4o to generate investment-grade analysis.

    Args:
        earnings: extracted earnings dict (keys as produced by extract_earnings).
        signals: signal dicts from detect_earnings_signals for this report.
        historical: prior-quarter summary dicts providing trend context.

    Returns:
        The model's free-text analysis, following the numbered template in
        the system prompt.
    """
    # Assemble all numeric context into one prompt block; fields absent from
    # the dict degrade gracefully to the literal string 'N/A'.
    context = f"""
EARNINGS REPORT โ {earnings['ticker']} {earnings['quarter']}
Revenue: ${earnings.get('revenue_millions', 'N/A')}M
Revenue Growth YoY: {earnings.get('revenue_growth_yoy', 'N/A')}%
EPS: ${earnings.get('eps', 'N/A')}
EPS Estimate: ${earnings.get('eps_estimate', 'N/A')}
Net Income: ${earnings.get('net_income_millions', 'N/A')}M
Guidance Revenue: {earnings.get('guidance_revenue', 'N/A')}
Guidance EPS: {earnings.get('guidance_eps', 'N/A')}
Key Highlights: {json.dumps(earnings.get('key_highlights', []))}
DETECTED SIGNALS:
{json.dumps(signals, indent=2)}
HISTORICAL CONTEXT (last 4 quarters):
{json.dumps(historical, indent=2)}
"""
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": """You are a senior financial analyst. Analyze the earnings report and provide:
1. HEADLINE: One-sentence summary of the quarter
2. BEAT/MISS: Did they beat or miss expectations and by how much?
3. GROWTH TREND: Is growth accelerating, stable, or decelerating?
4. GUIDANCE SIGNAL: Is forward guidance above, in-line, or below consensus?
5. KEY RISKS: Top 2-3 risks from this report
6. BULL CASE: What's the positive interpretation?
7. BEAR CASE: What's the negative interpretation?
8. VERDICT: BULLISH / NEUTRAL / BEARISH with confidence (high/medium/low)
Be concise. Use data. No fluff."""},
            {"role": "user", "content": context}
        ],
        # Low temperature reduces run-to-run variability of the analysis.
        temperature=0.3
    )
    return response.choices[0].message.content
def analyze_market_news_batch(news_items: list, watchlist: list) -> str:
    """Analyze a batch of market news for watchlist relevance.

    Args:
        news_items: list of news dicts; serialized to JSON for the prompt.
        watchlist: ticker symbols the analyst persona should focus on.

    Returns:
        Free-text analysis ranked by importance per the system prompt.
    """
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": f"""You are a market intelligence analyst monitoring these tickers: {', '.join(watchlist)}.
Analyze the news batch and identify:
1. MATERIAL NEWS: Anything that could move prices >2%
2. SENTIMENT SHIFT: Overall market mood changes
3. SECTOR SIGNALS: Industry-wide trends
4. ACTION ITEMS: Specific tickers that need attention and why
Rank by importance. Skip noise."""},
            {"role": "user", "content": json.dumps(news_items, indent=2)}
        ],
        # Low temperature reduces run-to-run variability of the analysis.
        temperature=0.3
    )
    return response.choices[0].message.content
import httpx

# Slack incoming-webhook URL for alert delivery; replace with your own
# (and prefer loading it from an environment variable over hard-coding).
SLACK_WEBHOOK = "https://hooks.slack.com/services/YOUR/WEBHOOK/URL"
async def send_financial_alert(signal: dict, analysis: str):
    """Send critical financial signals to Slack via an incoming webhook.

    Args:
        signal: dict with ticker, signal_type, severity and description keys
            (as produced by the detect_* helpers).
        analysis: AI-generated commentary appended to the message.
    """
    # NOTE(review): these emoji literals look mojibake'd (UTF-8 decoded with
    # the wrong codec) — confirm they render as the intended emoji in Slack.
    severity_emoji = {
        "critical": "๐จ",
        "important": "โ ๏ธ",
        "minor": "โน๏ธ"
    }
    # Fallback glyph for severities outside the three known levels.
    emoji = severity_emoji.get(signal["severity"], "๐")
    # Slack mrkdwn: *bold* markers and literal newlines.
    message = f"""{emoji} *Financial Signal โ {signal['ticker']}*
*Type:* {signal['signal_type'].replace('_', ' ').title()}
*Severity:* {signal['severity'].upper()}
*Details:* {signal['description']}
*AI Analysis:*
{analysis}
"""
    async with httpx.AsyncClient() as client:
        # Incoming webhooks accept a simple {"text": ...} JSON payload.
        await client.post(SLACK_WEBHOOK, json={"text": message})
async def generate_daily_report(conn, watchlist: list) -> str:
    """Generate end-of-day summary for all watched tickers.

    Args:
        conn: sqlite3 connection with stock_prices / market_signals populated.
        watchlist: tickers to include in the portfolio snapshot.

    Returns:
        GPT-4o-generated market summary text.
    """
    report_data = []
    for ticker in watchlist:
        # Latest stored quote per ticker (most recent scraped_at wins).
        latest = conn.execute(
            "SELECT price, change_percent, volume FROM stock_prices WHERE ticker = ? ORDER BY scraped_at DESC LIMIT 1",
            (ticker,)
        ).fetchone()
        if latest:
            report_data.append({
                "ticker": ticker,
                "price": latest[0],
                "change": latest[1],
                "volume": latest[2]
            })
    # Signals recorded today. ORDER BY severity sorts alphabetically, which
    # happens to put "critical" before "important" before "minor".
    signals = conn.execute(
        "SELECT ticker, signal_type, severity, description FROM market_signals WHERE date(created_at) = date('now') ORDER BY severity"
    ).fetchall()
    # NOTE(review): this synchronous OpenAI call blocks the event loop inside
    # an async function — consider AsyncOpenAI or running it in an executor.
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "Generate a concise end-of-day market summary. Lead with the most important moves. Include a brief outlook for tomorrow."},
            {"role": "user", "content": f"Portfolio:\n{json.dumps(report_data, indent=2)}\n\nToday's Signals:\n{json.dumps([dict(zip(['ticker','type','severity','desc'], s)) for s in signals], indent=2)}"}
        ],
        temperature=0.3
    )
    return response.choices[0].message.content
import asyncio
# Tickers monitored on every pipeline run.
WATCHLIST = ["AAPL", "GOOGL", "MSFT", "NVDA", "TSLA", "META", "AMZN"]

# URL templates per data category; {ticker} is filled in at runtime via .format().
# NOTE(review): Yahoo quote pages normally live under /quote/{ticker} —
# verify the plural /quotes/ path in the first template is intended.
FINANCIAL_SOURCES = {
    "quotes": [
        "https://finance.yahoo.com/quotes/{ticker}",
    ],
    "earnings": [
        "https://finance.yahoo.com/quote/{ticker}/financials",
    ],
    "news": [
        "https://finance.yahoo.com/quote/{ticker}/news",
    ]
}
async def run_financial_pipeline():
    """End-to-end run: for each watched ticker, scrape quotes and earnings,
    persist them, detect signals, escalate critical earnings signals to
    AI analysis + Slack, then print an end-of-day report."""
    conn = init_db()
    for ticker in WATCHLIST:
        print(f"\n๐ Processing {ticker}...")
        # 1. Scrape stock quote
        quote_url = FINANCIAL_SOURCES["quotes"][0].format(ticker=ticker)
        quotes = await extract_stock_quotes(quote_url)
        for q in quotes:
            # Keep only the row for the requested ticker — scraped pages may
            # list several symbols.
            if q.get("ticker", "").upper() == ticker:
                store_stock_price(conn, q, quote_url)
        price_signals = detect_price_signals(conn, ticker)
        for sig in price_signals:
            print(f" โก Signal: {sig['description']}")
        # 2. Check for earnings (during earnings season)
        earnings_url = FINANCIAL_SOURCES["earnings"][0].format(ticker=ticker)
        earnings = await extract_earnings(earnings_url)
        # A populated "quarter" field is the heuristic for "a real report was found".
        if earnings.get("quarter"):
            store_earnings(conn, earnings, earnings_url)
            earnings_signals = detect_earnings_signals(conn, earnings)
            for sig in earnings_signals:
                print(f" โก Signal: {sig['description']}")
                # AI analysis for critical signals only (limits token spend)
                if sig["severity"] == "critical":
                    historical = conn.execute(
                        "SELECT quarter, revenue_millions, eps FROM earnings WHERE ticker = ? ORDER BY id DESC LIMIT 4",
                        (ticker,)
                    ).fetchall()
                    analysis = analyze_earnings_with_ai(
                        earnings, [sig],
                        # Re-key rows into the shape analyze_earnings_with_ai expects.
                        [dict(zip(["quarter", "revenue", "eps"], h)) for h in historical]
                    )
                    await send_financial_alert(sig, analysis)
        await asyncio.sleep(1)  # Rate limiting between tickers
    # 3. Generate daily report
    report = await generate_daily_report(conn, WATCHLIST)
    print(f"\n๐ Daily Report:\n{report}")
    conn.close()


if __name__ == "__main__":
    asyncio.run(run_financial_pipeline())
| Use Case | Data Sources | AI Value-Add | Who Pays |
|---|---|---|---|
| Earnings Monitoring | Company IR pages, SEC EDGAR, press releases | Auto-detect beats/misses, guidance changes, sentiment shifts | Hedge funds, analysts ($5K-$50K/mo) |
| Alternative Data | Job postings, product reviews, app store rankings | Correlate non-financial signals with stock performance | Quant funds ($10K-$100K/mo) |
| SEC Filing Analysis | EDGAR 10-K, 10-Q, 8-K, insider trades | Extract risk factors, compare QoQ language changes | Compliance, research firms ($2K-$10K/mo) |
| Fintech Data Products | Multiple financial sites, APIs, news | Build and resell structured datasets with AI enrichment | Data vendors ($5K-$50K/mo per client) |
| Approach | Monthly Cost | Coverage | AI Analysis |
|---|---|---|---|
| Bloomberg Terminal | $2,000–$2,500 | Comprehensive | Limited |
| Refinitiv / FactSet | $1,500–$3,000 | Comprehensive | Basic |
| Alternative data vendors | $5,000–$50,000 | Niche | None |
| AI Agent + Mantis API | $29–$299 | Customizable | Full GPT-4o analysis |
The fastest path to a financial data pipeline:
Start scraping and analyzing financial data with AI extraction. Free tier includes 100 API calls/month.
Get Your API Key →