Reddit is the internet's largest community-driven forum, with over 1.7 billion monthly visits across 100,000+ active subreddits, and one of the web's most valuable data sources for market research, sentiment analysis, and AI training.
Google even signed a $60 million/year deal with Reddit for AI training data — that's how valuable this data is. But since 2023, getting Reddit data has become much harder and more expensive.
In June 2023, Reddit dramatically changed its API pricing, effectively killing free access for most developers:
| Tier | Price | Rate Limit | Use Case |
|---|---|---|---|
| Free | $0/mo | 100 req/min | Non-commercial, personal only |
| Commercial | ~$0.24/1K calls | Negotiated | Any commercial use |
| Enterprise | Custom (millions/yr) | Custom | AI training, large-scale |
The 2023 pricing changes killed dozens of popular third-party Reddit apps (Apollo, Reddit is Fun, Sync, etc.) and forced researchers and developers to find alternative data access methods. The free tier is restricted to non-commercial use, and any business application requires a paid agreement.
Here are four approaches to extract data from Reddit, from lightweight JSON endpoints to production-ready API solutions:
Reddit has a little-known feature: you can append .json to virtually any Reddit URL to get structured JSON data. This is the simplest scraping method — no API key required, no browser rendering needed.
pip install requests
import requests
import time
import json
class RedditScraper:
    """Scrape Reddit via its public ``.json`` endpoints (no API key needed).

    Appending ``.json`` to almost any Reddit URL returns structured JSON.
    A shared :class:`requests.Session` sends a browser-like User-Agent
    (generic scraper agents such as python-requests get throttled), every
    request is spaced out to ~10/minute (Reddit's unauthenticated cap),
    and every request carries an explicit timeout so a stalled connection
    cannot hang the scraper indefinitely.
    """

    BASE_URL = "https://www.reddit.com"
    # Seconds to wait for a response before aborting; matches the 15 s
    # timeout used by the Node.js scraper elsewhere in this guide.
    REQUEST_TIMEOUT = 15

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/120.0.0.0 Safari/537.36"
        })
        self.last_request = 0  # epoch seconds of the previous request

    def _rate_limit(self):
        """Sleep so requests stay >= 6 s apart (~10 req/min unauthenticated)."""
        elapsed = time.time() - self.last_request
        if elapsed < 6:
            time.sleep(6 - elapsed)
        self.last_request = time.time()

    def get_subreddit_posts(self, subreddit, sort="hot", limit=25, after=None):
        """Fetch one page of posts from *subreddit*.

        Args:
            subreddit: Name without the ``r/`` prefix.
            sort: Listing to read: "hot", "new", "top" or "rising".
            limit: Posts per page (Reddit caps this at 100).
            after: Pagination token from a previous call, or None.

        Returns:
            Tuple ``(posts, next_page)``: a list of post dicts and the
            token for the next page (None when the listing is exhausted).

        Raises:
            requests.HTTPError: For non-2xx responses (e.g. 429).
            requests.Timeout: If Reddit does not answer within the timeout.
        """
        self._rate_limit()
        params = {"limit": min(limit, 100)}
        if after:
            params["after"] = after
        url = f"{self.BASE_URL}/r/{subreddit}/{sort}.json"
        resp = self.session.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
        resp.raise_for_status()
        data = resp.json()
        posts = []
        for child in data["data"]["children"]:
            post = child["data"]
            posts.append({
                "id": post["id"],
                "title": post["title"],
                "author": post["author"],
                "score": post["score"],
                "upvote_ratio": post["upvote_ratio"],
                "num_comments": post["num_comments"],
                "url": post["url"],
                "selftext": post.get("selftext", "")[:500],  # cap body text
                "created_utc": post["created_utc"],
                "permalink": post["permalink"],
                "subreddit": post["subreddit"],
                "flair": post.get("link_flair_text", ""),
                "is_video": post.get("is_video", False),
                "thumbnail": post.get("thumbnail", ""),
            })
        next_page = data["data"].get("after")
        return posts, next_page

    def get_post_comments(self, subreddit, post_id, sort="best"):
        """Fetch a post and its top-level comments.

        Returns:
            Tuple ``(post_data, comments)``: the raw post payload dict and
            a list of normalized comment dicts.
        """
        self._rate_limit()
        url = f"{self.BASE_URL}/r/{subreddit}/comments/{post_id}.json"
        params = {"sort": sort}
        resp = self.session.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
        resp.raise_for_status()
        data = resp.json()
        # The endpoint returns a 2-element array: [post listing, comments].
        post_data = data[0]["data"]["children"][0]["data"]
        comments_data = data[1]["data"]["children"]
        comments = []
        for child in comments_data:
            if child["kind"] != "t1":  # Skip "more comments" placeholders
                continue
            comment = child["data"]
            comments.append({
                "id": comment["id"],
                "author": comment["author"],
                "body": comment["body"],
                "score": comment["score"],
                "created_utc": comment["created_utc"],
                "is_op": comment["author"] == post_data["author"],
                "depth": comment.get("depth", 0),
            })
        return post_data, comments

    def search_subreddit(self, subreddit, query, sort="relevance", limit=25):
        """Search for *query* inside a single subreddit.

        Returns:
            List of matching post dicts (id, title, author, score, ...).
        """
        self._rate_limit()
        url = f"{self.BASE_URL}/r/{subreddit}/search.json"
        params = {
            "q": query,
            "restrict_sr": "on",  # keep results inside this subreddit
            "sort": sort,
            "limit": min(limit, 100),
        }
        resp = self.session.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
        resp.raise_for_status()
        data = resp.json()
        posts = []
        for child in data["data"]["children"]:
            post = child["data"]
            posts.append({
                "id": post["id"],
                "title": post["title"],
                "author": post["author"],
                "score": post["score"],
                "num_comments": post["num_comments"],
                "permalink": post["permalink"],
                "selftext": post.get("selftext", "")[:300],
            })
        return posts

    def get_user_posts(self, username, sort="new", limit=25):
        """Fetch a user's recent submissions.

        Returns:
            List of dicts with title, subreddit, score, comment count and
            creation time.
        """
        self._rate_limit()
        url = f"{self.BASE_URL}/user/{username}/submitted.json"
        params = {"sort": sort, "limit": min(limit, 100)}
        resp = self.session.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
        resp.raise_for_status()
        data = resp.json()
        posts = []
        for child in data["data"]["children"]:
            post = child["data"]
            posts.append({
                "title": post["title"],
                "subreddit": post["subreddit"],
                "score": post["score"],
                "num_comments": post["num_comments"],
                "created_utc": post["created_utc"],
            })
        return posts
# Usage example for the JSON-endpoint scraper.
scraper = RedditScraper()
# Get the current top posts from r/python (each call is rate-limited).
posts, next_page = scraper.get_subreddit_posts("python", sort="top", limit=10)
for p in posts:
    print(f"[{p['score']:>5}] {p['title'][:70]}")
    print(f" {p['num_comments']} comments | by u/{p['author']}")
    print()
# Get comments on the first post returned above.
post, comments = scraper.get_post_comments("python", posts[0]["id"])
print(f"\nComments on: {post['title']}")
for c in comments[:5]:
    print(f" u/{c['author']} ({c['score']} pts): {c['body'][:100]}...")
# Search for posts within a single subreddit.
results = scraper.search_subreddit("webdev", "web scraping API")
print(f"\nSearch results: {len(results)} posts found")
def scrape_all_posts(scraper, subreddit, sort="top", max_posts=200):
    """Collect up to *max_posts* posts by walking Reddit's pagination.

    Repeatedly calls ``scraper.get_subreddit_posts`` with the ``after``
    token until enough posts are gathered, the listing is exhausted, or
    a page comes back empty.
    """
    collected = []
    token = None
    while len(collected) < max_posts:
        remaining = max_posts - len(collected)
        page, token = scraper.get_subreddit_posts(
            subreddit, sort=sort, limit=min(100, remaining), after=token
        )
        if not page:
            break
        collected += page
        print(f"Fetched {len(collected)} posts...")
        if not token:
            break  # no further pages available
    return collected[:max_posts]
# Scrape 200 top posts from r/machinelearning (two rate-limited pages).
all_posts = scrape_all_posts(scraper, "machinelearning", sort="top", max_posts=200)
print(f"Total posts scraped: {len(all_posts)}")
PRAW (Python Reddit API Wrapper) is the official Python library for the Reddit API. It provides a cleaner interface than raw JSON endpoints, but requires API credentials.
pip install praw
import praw
from datetime import datetime
# Initialize the Reddit API client. Credentials come from an app created
# at https://www.reddit.com/prefs/apps; a descriptive User-Agent is
# required — generic agents are throttled (see anti-scraping notes below).
reddit = praw.Reddit(
    client_id="YOUR_CLIENT_ID",
    client_secret="YOUR_CLIENT_SECRET",
    user_agent="DataScraper/1.0 by YourUsername"
)
def scrape_subreddit_praw(subreddit_name, sort="hot", limit=50):
    """Scrape subreddit posts using PRAW.

    Supported sorts: "hot", "top" (past month), "new", "rising"; any
    other value falls back to the hot listing.
    """
    subreddit = reddit.subreddit(subreddit_name)
    # Pick the matching PRAW listing generator.
    if sort == "top":
        listing = subreddit.top(limit=limit, time_filter="month")
    elif sort == "new":
        listing = subreddit.new(limit=limit)
    elif sort == "rising":
        listing = subreddit.rising(limit=limit)
    else:
        # "hot" and unrecognized values both use the hot listing.
        listing = subreddit.hot(limit=limit)
    return [
        {
            "id": submission.id,
            "title": submission.title,
            "author": str(submission.author),
            "score": submission.score,
            "upvote_ratio": submission.upvote_ratio,
            "num_comments": submission.num_comments,
            "url": submission.url,
            "selftext": submission.selftext[:500] if submission.selftext else "",
            "created": datetime.fromtimestamp(submission.created_utc).isoformat(),
            "permalink": f"https://reddit.com{submission.permalink}",
            "flair": submission.link_flair_text,
            "awards": submission.total_awards_received,
        }
        for submission in listing
    ]
def scrape_comments_praw(post_url, limit=100):
    """Scrape all comments from a post using PRAW."""
    submission = reddit.submission(url=post_url)
    # Resolve up to five "load more comments" stubs into real comments.
    submission.comments.replace_more(limit=5)
    op_name = str(submission.author)
    collected = []
    for c in submission.comments.list()[:limit]:
        collected.append({
            "id": c.id,
            "author": str(c.author),
            "body": c.body,
            "score": c.score,
            "created": datetime.fromtimestamp(c.created_utc).isoformat(),
            "parent_id": c.parent_id,
            "depth": c.depth,
            "is_op": str(c.author) == op_name,  # flag comments by the poster
        })
    return collected
def search_reddit_praw(query, subreddit=None, sort="relevance", limit=50):
    """Search across Reddit, or within one subreddit when given.

    Falls back to r/all (site-wide search) when *subreddit* is None.
    """
    target = reddit.subreddit(subreddit if subreddit else "all")
    return [
        {
            "title": hit.title,
            "subreddit": str(hit.subreddit),
            "score": hit.score,
            "num_comments": hit.num_comments,
            "url": hit.url,
            "permalink": f"https://reddit.com{hit.permalink}",
        }
        for hit in target.search(query, sort=sort, limit=limit)
    ]
# Usage: pull this month's top posts from r/artificial.
posts = scrape_subreddit_praw("artificial", sort="top", limit=20)
for p in posts:
    print(f"[{p['score']:>5}] r/{p['flair'] or 'N/A'} | {p['title'][:60]}")
# Search site-wide for web scraping discussions.
results = search_reddit_praw("web scraping API best", sort="relevance", limit=10)
for r in results:
    print(f"r/{r['subreddit']}: {r['title'][:60]} ({r['score']} pts)")
The old Reddit interface (old.reddit.com) is much simpler HTML — no JavaScript rendering required. Combined with Reddit's JSON endpoints, this makes Node.js + Cheerio an excellent lightweight option.
npm install axios cheerio
const axios = require('axios');
const cheerio = require('cheerio');
/**
 * Reddit scraper for Node.js combining two data sources:
 *   - www.reddit.com JSON endpoints (structured data, no API key), and
 *   - old.reddit.com server-rendered HTML parsed with Cheerio.
 * All requests share one axios client with a browser-like User-Agent and
 * are spaced >= 6 s apart (~10 req/min, Reddit's unauthenticated limit).
 */
class RedditScraper {
    constructor() {
        this.client = axios.create({
            headers: {
                // Browser-like UA; generic scraper agents get throttled.
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ' +
                    'AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36'
            },
            timeout: 15000, // fail fast instead of hanging on a stalled request
        });
        this.lastRequest = 0; // epoch ms of the previous request
    }

    // Sleep so consecutive requests stay at least 6 seconds apart.
    async rateLimit() {
        const elapsed = Date.now() - this.lastRequest;
        if (elapsed < 6000) {
            await new Promise(r => setTimeout(r, 6000 - elapsed));
        }
        this.lastRequest = Date.now();
    }

    /**
     * Fetch one page of posts from a subreddit listing.
     * @param {string} subreddit - name without the "r/" prefix
     * @param {string} sort - "hot" | "new" | "top" | "rising"
     * @param {number} limit - posts to request
     * @returns {Promise<object[]>} normalized post records
     */
    async getSubredditPosts(subreddit, sort = 'hot', limit = 25) {
        await this.rateLimit();
        // Use JSON endpoint for structured data
        const url = `https://www.reddit.com/r/${subreddit}/${sort}.json?limit=${limit}`;
        const { data } = await this.client.get(url);
        return data.data.children.map(child => {
            const post = child.data;
            return {
                id: post.id,
                title: post.title,
                author: post.author,
                score: post.score,
                upvoteRatio: post.upvote_ratio,
                numComments: post.num_comments,
                url: post.url,
                selftext: (post.selftext || '').slice(0, 500), // cap body size
                createdUtc: post.created_utc,
                permalink: post.permalink,
                subreddit: post.subreddit,
                flair: post.link_flair_text,
            };
        });
    }

    /**
     * Fetch a post plus its comment listing.
     * @returns {Promise<{post: object, comments: object[]}>}
     */
    async getPostComments(subreddit, postId, sort = 'best') {
        await this.rateLimit();
        const url = `https://www.reddit.com/r/${subreddit}/comments/${postId}.json?sort=${sort}`;
        const { data } = await this.client.get(url);
        // The endpoint returns a 2-element array: [post listing, comment listing].
        const postData = data[0].data.children[0].data;
        const comments = data[1].data.children
            .filter(c => c.kind === 't1') // drop "more comments" placeholders
            .map(c => {
                const comment = c.data;
                return {
                    id: comment.id,
                    author: comment.author,
                    body: comment.body,
                    score: comment.score,
                    createdUtc: comment.created_utc,
                    depth: comment.depth || 0,
                };
            });
        return { post: postData, comments };
    }

    /**
     * Scrape old.reddit.com HTML for additional data not in the JSON
     * payload (thumbnails, expanded previews, etc.). Follows the "next"
     * pagination button for up to `pages` pages.
     */
    async scrapeOldReddit(subreddit, pages = 3) {
        const allPosts = [];
        let after = null;
        for (let i = 0; i < pages; i++) {
            await this.rateLimit();
            let url = `https://old.reddit.com/r/${subreddit}/`;
            if (after) url += `?after=${after}`;
            const { data: html } = await this.client.get(url);
            const $ = cheerio.load(html);
            // Each post row is a div.thing carrying data-* attributes.
            $('div.thing[data-fullname]').each((_, el) => {
                const $el = $(el);
                allPosts.push({
                    id: $el.attr('data-fullname'),
                    title: $el.find('a.title').text().trim(),
                    author: $el.attr('data-author'),
                    // Score read from the element's title attribute; defaults to 0.
                    score: parseInt($el.find('.score.unvoted').attr('title') || '0'),
                    comments: parseInt(
                        $el.find('.comments').text().match(/\d+/)?.[0] || '0'
                    ),
                    domain: $el.attr('data-domain'),
                    timestamp: $el.find('time').attr('datetime'),
                    permalink: $el.find('a.comments').attr('href'),
                    thumbnail: $el.find('img.thumbnail').attr('src'),
                });
            });
            // Get next page token from the "next" button, if present.
            const nextBtn = $('span.next-button a');
            if (nextBtn.length) {
                const nextUrl = new URL(nextBtn.attr('href'), 'https://old.reddit.com');
                after = nextUrl.searchParams.get('after');
            } else {
                break; // last page reached
            }
        }
        return allPosts;
    }

    /**
     * Search site-wide, or within one subreddit when `subreddit` is given.
     */
    async searchReddit(query, subreddit = null, sort = 'relevance', limit = 25) {
        await this.rateLimit();
        const sub = subreddit ? `r/${subreddit}/` : '';
        const url = `https://www.reddit.com/${sub}search.json`;
        const params = {
            q: query,
            sort,
            limit,
            // restrict_sr keeps results inside the target subreddit.
            ...(subreddit ? { restrict_sr: 'on' } : {}),
        };
        const { data } = await this.client.get(url, { params });
        return data.data.children.map(c => ({
            title: c.data.title,
            subreddit: c.data.subreddit,
            score: c.data.score,
            numComments: c.data.num_comments,
            permalink: c.data.permalink,
        }));
    }
}
// Usage — demonstrates each scraping approach in turn.
(async () => {
    const scraper = new RedditScraper();
    // 1) JSON endpoint approach: structured post data.
    console.log('--- Top Posts from r/webdev ---');
    const posts = await scraper.getSubredditPosts('webdev', 'top', 5);
    posts.forEach(p => {
        console.log(`[${p.score}] ${p.title.slice(0, 60)}`);
        console.log(` ${p.numComments} comments | by u/${p.author}`);
    });
    // 2) Comment thread of the first post returned above.
    console.log('\n--- Comments on Top Post ---');
    const { post, comments } = await scraper.getPostComments('webdev', posts[0].id);
    comments.slice(0, 3).forEach(c => {
        console.log(` u/${c.author} (${c.score} pts): ${c.body.slice(0, 80)}...`);
    });
    // 3) Old Reddit HTML scraping (extra fields via Cheerio).
    console.log('\n--- Old Reddit HTML Scraping ---');
    const oldPosts = await scraper.scrapeOldReddit('programming', 2);
    console.log(`Scraped ${oldPosts.length} posts from old.reddit.com`);
    // 4) Site-wide search.
    console.log('\n--- Search Results ---');
    const results = await scraper.searchReddit('web scraping python API');
    results.slice(0, 5).forEach(r => {
        console.log(`r/${r.subreddit}: ${r.title.slice(0, 50)} (${r.score} pts)`);
    });
})();
For production applications, Mantis provides the most reliable way to extract Reddit data. One API call handles rendering, anti-bot bypassing, proxy rotation, and structured data extraction — with no rate limit headaches.
import requests
# Scrape a subreddit page via Mantis: one POST handles JS rendering,
# anti-bot bypass, proxy rotation, and CSS-selector extraction.
response = requests.post(
    "https://api.mantisapi.com/v1/scrape",
    headers={"x-api-key": "YOUR_API_KEY"},
    json={
        "url": "https://www.reddit.com/r/artificial/top/?t=week",
        "render_js": True,  # new Reddit is a React SPA; rendering is required
        "wait_for": "[data-testid='post-container']",  # wait until posts mount
        "extract": {
            "posts": {
                "_selector": "[data-testid='post-container']",  # one entry per card
                "_type": "list",
                "title": "a[data-click-id='body'] h3",
                "score": "[data-click-id='upvote'] + div",
                "comments": "a[data-click-id='comments'] span",
                "author": "a[data-testid='post_author-text']",
            }
        }
    }
)
data = response.json()
# Print the first ten extracted posts.
for post in data["extracted"]["posts"][:10]:
    print(f"[{post['score']}] {post['title']}")
    print(f" {post['comments']} | by {post['author']}")
    print()
# Scrape comments from a specific post: scroll so lazily-loaded comments
# render, then extract the post plus a list of comment records.
response = requests.post(
    "https://api.mantisapi.com/v1/scrape",
    headers={"x-api-key": "YOUR_API_KEY"},
    json={
        "url": "https://www.reddit.com/r/python/comments/abc123/example_post/",
        "render_js": True,
        "scroll_count": 3,  # scroll 3 times to trigger comment lazy-loading
        "extract": {
            "post_title": "h1",
            "post_body": "[data-testid='post-content']",
            "comments": {
                "_selector": "[data-testid='comment']",
                "_type": "list",
                "author": "a[data-testid='comment_author-text']",
                "body": "[data-testid='comment'] > div:last-child",
                "score": "span[id*='vote-arrows']",
            }
        }
    }
)
data = response.json()
print(f"Post: {data['extracted']['post_title']}")
# Show the first ten extracted comments.
for c in data["extracted"]["comments"][:10]:
    print(f" u/{c['author']}: {c['body'][:100]}...")
Extract posts, comments, and subreddit data with a single API call. No API keys, no proxy management, no broken selectors.
View Pricing · Get Started Free

Reddit has increasingly aggressive anti-scraping measures. Understanding them is essential:
Reddit enforces strict rate limits: 10 requests per minute for unauthenticated users, 60 requests per minute with OAuth authentication. Exceeding these limits returns 429 (Too Many Requests) errors and can trigger temporary IP bans lasting minutes to hours.
Reddit blocks requests with generic or missing User-Agent headers. The API documentation requires a descriptive User-Agent string. Requests using common scraper User-Agents (python-requests, curl) are throttled or blocked outright.
Reddit can shadowban IPs or accounts suspected of scraping. Shadowbanned requests receive valid-looking but empty or limited responses — your scraper appears to work but returns incomplete data. This is particularly insidious because it's hard to detect.
Suspicious traffic triggers CAPTCHA challenges, especially on new Reddit (the React SPA). These require full browser execution to solve and cannot be bypassed with simple HTTP requests.
Reddit maintains blocklists of known datacenter IP ranges. Cloud server IPs (AWS, GCP, Azure) are often pre-blocked or heavily rate-limited. Residential proxies work better but add cost.
Some subreddits and content require authentication to view. NSFW content, quarantined subreddits, and age-restricted posts all require a logged-in session, which complicates scraping.
| Data Type | Fields | Auth Required? |
|---|---|---|
| Posts | Title, body, score, upvote ratio, comments count, author, flair, awards, URL, media | No |
| Comments | Body, author, score, timestamp, depth, parent ID, awards, edited status | No |
| Subreddits | Description, subscribers, active users, rules, wiki, flairs, moderators | No |
| User Profiles | Post/comment karma, account age, recent posts, recent comments, trophies | Partial |
| Search | Posts matching keywords, filtered by subreddit, time range, sort order | No |
| Wiki Pages | Full wiki content, revision history, contributors | No |
| Flairs | Post flairs, user flairs (per subreddit) | Partial |
| Awards | Award types, counts, gilding level | No |
Monitor what Reddit users think about your product or industry. Reddit discussions are brutally honest — making it the best source for unfiltered customer sentiment.
import requests
import json
from collections import Counter
from datetime import datetime, timedelta
class RedditSentimentAnalyzer:
    """Analyze sentiment across Reddit discussions.

    Searches a set of subreddits for brand mentions and tags each one
    with a rough keyword-based sentiment label.
    """

    def __init__(self):
        # Reuse the rate-limited JSON-endpoint scraper from Method 1.
        self.scraper = RedditScraper()

    def analyze_brand(self, brand_name, subreddits=None, days=30):
        """Analyze brand sentiment across relevant subreddits.

        NOTE(review): *days* is currently unused — the underlying search
        returns no timestamps to filter on; confirm before relying on it.
        """
        if not subreddits:
            subreddits = ["technology", "startups", "SaaS", "webdev"]
        mentions = []
        for sub in subreddits:
            hits = self.scraper.search_subreddit(sub, brand_name, sort="new", limit=50)
            for hit in hits:
                combined_text = hit["title"] + " " + hit.get("selftext", "")
                mentions.append({
                    "subreddit": sub,
                    "title": hit["title"],
                    "score": hit["score"],
                    "comments": hit["num_comments"],
                    "sentiment": self._classify_sentiment(combined_text),
                })

        def top_by_score(label):
            # Highest-scoring mentions carrying the given sentiment label.
            matching = [m for m in mentions if m["sentiment"] == label]
            return sorted(matching, key=lambda m: m["score"], reverse=True)[:3]

        return {
            "brand": brand_name,
            "total_mentions": len(mentions),
            "sentiment_breakdown": dict(Counter(m["sentiment"] for m in mentions)),
            "top_positive": top_by_score("positive"),
            "top_negative": top_by_score("negative"),
            "subreddit_distribution": dict(
                Counter(m["subreddit"] for m in mentions)
            ),
        }

    def _classify_sentiment(self, text):
        """Classify *text* as "positive"/"negative"/"neutral" by counting
        keyword hits (substring matches, case-insensitive)."""
        lowered = text.lower()
        positive_terms = {"love", "great", "amazing", "best", "awesome", "recommend",
                          "excellent", "fantastic", "solid", "impressed", "switched to"}
        negative_terms = {"hate", "terrible", "worst", "awful", "broken", "scam",
                          "avoid", "disappointed", "buggy", "overpriced", "switched from"}
        pos_hits = sum(term in lowered for term in positive_terms)
        neg_hits = sum(term in lowered for term in negative_terms)
        if pos_hits == neg_hits:
            return "neutral"
        return "positive" if pos_hits > neg_hits else "negative"
# Usage: build a sentiment report for one brand across default subreddits.
analyzer = RedditSentimentAnalyzer()
report = analyzer.analyze_brand("Mantis API")
print(f"Brand: {report['brand']}")
print(f"Total mentions: {report['total_mentions']}")
print(f"Sentiment: {report['sentiment_breakdown']}")
print(f"\nTop positive mentions:")
for p in report["top_positive"]:
    print(f" [{p['score']}] r/{p['subreddit']}: {p['title'][:60]}")
print(f"\nTop negative mentions:")
for p in report["top_negative"]:
    print(f" [{p['score']}] r/{p['subreddit']}: {p['title'][:60]}")
Track competitor mentions, compare engagement, and identify feature requests that your product could address.
const axios = require('axios');
/**
 * Track competitor mentions across subreddits via Reddit's public
 * search.json endpoint, aggregating mention counts, scores, and top posts.
 */
class CompetitorMonitor {
    constructor() {
        this.client = axios.create({
            headers: {
                // Descriptive User-Agent, as Reddit's guidelines require.
                'User-Agent': 'CompetitorMonitor/1.0 (research)'
            },
            timeout: 15000,
        });
    }

    /**
     * Search each subreddit for each competitor name and aggregate stats.
     * @param {string[]} competitors - names to search for
     * @param {string[]} subreddits - subreddits to search within
     * @returns {Promise<object>} per-competitor mention stats and top posts
     */
    async compareCompetitors(competitors, subreddits) {
        const results = {};
        for (const competitor of competitors) {
            results[competitor] = {
                totalMentions: 0,
                totalScore: 0,
                avgScore: 0,
                topPosts: [],
                subreddits: {}, // mention count per subreddit
            };
            for (const sub of subreddits) {
                // 6 s spacing keeps us under ~10 req/min unauthenticated.
                await new Promise(r => setTimeout(r, 6000)); // Rate limit
                const url = `https://www.reddit.com/r/${sub}/search.json`;
                const { data } = await this.client.get(url, {
                    params: { q: competitor, restrict_sr: 'on', sort: 'top', limit: 25 }
                });
                const posts = data.data.children.map(c => ({
                    title: c.data.title,
                    score: c.data.score,
                    comments: c.data.num_comments,
                    subreddit: sub,
                    permalink: c.data.permalink,
                }));
                results[competitor].totalMentions += posts.length;
                results[competitor].totalScore += posts.reduce((s, p) => s + p.score, 0);
                results[competitor].topPosts.push(...posts);
                results[competitor].subreddits[sub] = posts.length;
            }
            // Finalize: average score plus the five highest-scoring posts.
            const r = results[competitor];
            r.avgScore = r.totalMentions > 0
                ? Math.round(r.totalScore / r.totalMentions)
                : 0;
            r.topPosts = r.topPosts
                .sort((a, b) => b.score - a.score)
                .slice(0, 5);
        }
        return results;
    }
}
// Compare scraping-API competitors across four developer subreddits.
(async () => {
    const monitor = new CompetitorMonitor();
    const results = await monitor.compareCompetitors(
        ['ScrapingBee', 'Apify', 'Crawlee', 'Bright Data'],
        ['webdev', 'python', 'node', 'datascience']
    );
    // Print a short summary line per competitor.
    for (const [name, data] of Object.entries(results)) {
        console.log(`\n📊 ${name}:`);
        console.log(` Mentions: ${data.totalMentions} | Avg Score: ${data.avgScore}`);
        console.log(` Top post: ${data.topPosts[0]?.title.slice(0, 60) || 'N/A'}`);
    }
})();
Build a curated knowledge base from Reddit's best answers — perfect for RAG (Retrieval-Augmented Generation) pipelines and AI agent tools.
import json
from datetime import datetime, timezone

import requests
class RedditKnowledgeBuilder:
    """Build AI knowledge bases from Reddit's best content."""

    def __init__(self):
        # Reuse the rate-limited JSON-endpoint scraper from Method 1.
        self.scraper = RedditScraper()

    def build_knowledge_base(self, topic, subreddits, min_score=10):
        """Extract high-quality Q&A pairs from Reddit discussions.

        Args:
            topic: Search query run inside each subreddit.
            subreddits: Iterable of subreddit names to mine.
            min_score: Minimum post score for a thread to be considered.

        Returns:
            List of entries, each holding the question (post title), a
            context snippet, up to three top-scored answers, and
            provenance metadata.
        """
        knowledge_base = []
        for sub in subreddits:
            # Search for relevant, highly-rated discussions.
            posts = self.scraper.search_subreddit(
                sub, topic, sort="top", limit=50
            )
            for post in posts:
                if post["score"] < min_score:
                    continue
                # Fetch comments; skip posts whose thread fetch fails
                # (deleted, restricted, rate-limited, ...) instead of
                # aborting the whole run.
                try:
                    post_data, comments = self.scraper.get_post_comments(
                        sub, post["id"]
                    )
                except Exception:
                    continue
                # Keep the three best answers (top-scored comments >= 5 pts).
                best_comments = sorted(
                    (c for c in comments if c["score"] >= 5),
                    key=lambda c: c["score"],
                    reverse=True,
                )[:3]
                if best_comments:
                    knowledge_base.append({
                        "question": post["title"],
                        "context": post.get("selftext", "")[:300],
                        "answers": [
                            {
                                "text": c["body"],
                                "score": c["score"],
                                "author": c["author"],
                            }
                            for c in best_comments
                        ],
                        "source_url": f"https://reddit.com{post['permalink']}",
                        "subreddit": sub,
                        "post_score": post["score"],
                        # Timezone-aware UTC stamp; datetime.utcnow() is
                        # deprecated (Python 3.12) and returns naive times.
                        "extracted_at": datetime.now(timezone.utc).isoformat(),
                    })
        return knowledge_base

    def export_for_rag(self, knowledge_base, output_file="reddit_kb.jsonl"):
        """Export knowledge base as JSONL for RAG ingestion.

        Each output line is one JSON document with an embeddable ``text``
        field (question + best answer + source URL) plus ``metadata``
        suitable for filtering at retrieval time.
        """
        # Explicit UTF-8 so output doesn't depend on the platform default.
        with open(output_file, "w", encoding="utf-8") as f:
            for entry in knowledge_base:
                best_answer = entry["answers"][0]["text"]
                doc = {
                    "text": f"Question: {entry['question']}\n\n"
                            f"Best Answer: {best_answer}\n\n"
                            f"Source: {entry['source_url']}",
                    "metadata": {
                        "source": "reddit",
                        "subreddit": entry["subreddit"],
                        "score": entry["post_score"],
                        "answer_score": entry["answers"][0]["score"],
                    }
                }
                # ensure_ascii=False keeps non-ASCII Reddit text readable.
                f.write(json.dumps(doc, ensure_ascii=False) + "\n")
        print(f"Exported {len(knowledge_base)} entries to {output_file}")
# Usage: build a knowledge base about web scraping from four subreddits.
builder = RedditKnowledgeBuilder()
kb = builder.build_knowledge_base(
    topic="web scraping best practices",
    subreddits=["webdev", "python", "learnprogramming", "datascience"],
    min_score=15
)
print(f"Built knowledge base with {len(kb)} entries")
# Preview the first three Q&A entries.
for entry in kb[:3]:
    print(f"\nQ: {entry['question'][:80]}")
    print(f"A: {entry['answers'][0]['text'][:120]}...")
    print(f" Score: {entry['post_score']} | r/{entry['subreddit']}")
# Export as JSONL for a RAG/embedding pipeline.
builder.export_for_rag(kb)
| Feature | Reddit API (Free) | DIY Scraping | Mantis API |
|---|---|---|---|
| Cost | Free (non-commercial only) | Server + proxy costs | $29/mo (5K requests) |
| Commercial use | Requires paid agreement | Your responsibility | Included |
| Rate limits | 60 req/min (auth), 10 (unauth) | IP-based blocks | Per plan |
| Setup time | Minutes (need credentials) | Hours | Minutes |
| Data format | Structured JSON | JSON or HTML → parse | Structured JSON |
| JS rendering | N/A (API) | Needed for new Reddit | Included |
| Anti-bot handling | N/A (API) | You manage | Included |
| Maintenance | API version updates | DOM changes, rate limits | Zero |
| Historical data | Limited (1000 posts max) | Limited by pagination | Current pages |
| Reliability | High | Medium | High |
Mantis handles proxy rotation, JavaScript rendering, and anti-bot measures. Get structured Reddit data with a single API call.
View Pricing · Get Started Free

The legal landscape around Reddit scraping has become more complex since the 2023 API changes. Here's what you need to know:
Disclaimer: This article is for educational purposes only. Web scraping may violate Reddit's Terms of Service. Always ensure your scraping activities comply with applicable laws and regulations in your jurisdiction.
See the structured FAQ data above for common questions about scraping Reddit. Key points:
Now that you know how to scrape Reddit, explore more scraping guides: