-
Notifications
You must be signed in to change notification settings - Fork 276
Large Scale Sitemaps
Rumen Damyanov edited this page Jul 29, 2025
·
1 revision
Handle millions of URLs efficiently with optimized memory usage, chunking strategies, and automated generation. This guide shows how to scale sitemap generation for massive websites.
- Memory Limits: PHP memory exhaustion with millions of URLs
- URL Limits: 50,000 URLs max per sitemap file
- File Size: 50MB max per sitemap (uncompressed)
- Generation Time: Long execution times
- Server Resources: CPU and I/O intensive operations
<?php
use Rumenx\Sitemap\Sitemap;
/**
 * Generates product sitemaps for very large catalogues.
 *
 * Products are read from the database in small batches and written out as
 * numbered files (sitemap-products-N.xml) holding at most $chunkSize URLs
 * each, followed by an index file that lists every generated sitemap.
 */
class LargeScaleSitemapGenerator
{
    private $baseUrl;
    private $outputDir;
    private $pdo;
    private $chunkSize = 50000; // Max URLs per sitemap

    /**
     * @param string $baseUrl   Site base URL; a trailing slash is stripped.
     * @param string $outputDir Directory the XML files are written to (created if missing).
     * @param array  $dbConfig  Keys: host, name, user, pass.
     *
     * @throws RuntimeException If the output directory cannot be created.
     * @throws PDOException     If the database connection fails.
     */
    public function __construct($baseUrl, $outputDir, $dbConfig)
    {
        $this->baseUrl = rtrim($baseUrl, '/');
        $this->outputDir = rtrim($outputDir, '/') . '/';
        $this->pdo = new PDO(
            "mysql:host={$dbConfig['host']};dbname={$dbConfig['name']}",
            $dbConfig['user'],
            $dbConfig['pass'],
            [
                // Unbuffered for memory efficiency
                PDO::MYSQL_ATTR_USE_BUFFERED_QUERY => false,
                // Surface SQL failures as exceptions instead of silent `false` returns.
                PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION,
            ]
        );

        if (!is_dir($this->outputDir)
            && !mkdir($this->outputDir, 0755, true)
            && !is_dir($this->outputDir)
        ) {
            throw new RuntimeException("Unable to create output directory: {$this->outputDir}");
        }
    }

    /**
     * Writes all active products into chunked sitemap files plus an index.
     *
     * @return void
     *
     * @throws RuntimeException If a sitemap file cannot be written.
     */
    public function generateLargeProductSitemaps()
    {
        echo "Starting large-scale product sitemap generation...\n";

        // Get total count. The cursor is closed explicitly so the unbuffered
        // connection is free again before the next statement is executed.
        $countStmt = $this->pdo->query("SELECT COUNT(*) as total FROM products WHERE active = 1");
        $totalProducts = (int) $countStmt->fetchColumn();
        $countStmt->closeCursor();
        $countStmt = null;
        echo "Total products: {$totalProducts}\n";

        $sitemapCounter = 0; // Number of sitemap files written so far
        $urlCounter = 0;     // URLs accumulated in the current sitemap
        $sitemapIndex = new Sitemap();
        $currentSitemap = new Sitemap();

        // Process products in small batches to avoid memory issues.
        $batchSize = 1000;
        $stmt = $this->pdo->prepare("
            SELECT slug, updated_at
            FROM products
            WHERE active = 1
            ORDER BY id
            LIMIT :limit OFFSET :offset
        ");

        for ($offset = 0; $offset < $totalProducts; $offset += $batchSize) {
            echo "Processing products {$offset} to " . ($offset + $batchSize) . "\n";

            $stmt->bindValue(':limit', $batchSize, PDO::PARAM_INT);
            $stmt->bindValue(':offset', $offset, PDO::PARAM_INT);
            $stmt->execute();

            while ($product = $stmt->fetch(PDO::FETCH_ASSOC)) {
                if ($urlCounter >= $this->chunkSize) {
                    // Current file is full: persist it and start a new one.
                    $this->flushSitemap($currentSitemap, $sitemapIndex, $sitemapCounter, $urlCounter);
                    $currentSitemap = new Sitemap();
                    $urlCounter = 0;
                    $sitemapCounter++;
                }

                $currentSitemap->add(
                    "{$this->baseUrl}/products/{$product['slug']}",
                    date('c', strtotime($product['updated_at'])),
                    '0.8',
                    'weekly'
                );
                $urlCounter++;
            }

            $stmt->closeCursor();

            // Optional: garbage collection every 10,000 processed rows.
            if (($offset + $batchSize) % 10000 === 0) {
                gc_collect_cycles();
            }
        }

        // Handle remaining URLs
        if ($urlCounter > 0) {
            $this->flushSitemap($currentSitemap, $sitemapIndex, $sitemapCounter, $urlCounter);
            $sitemapCounter++;
        }

        // Generate sitemap index
        $this->generateSitemapIndex($sitemapIndex, 'sitemap-products-index.xml');
        echo "Generated sitemap index for {$totalProducts} products in {$sitemapCounter} files\n";
    }

    /**
     * Saves one finished sitemap, registers it in the index and reports it.
     *
     * @param Sitemap $sitemap       Sitemap holding the URLs to persist.
     * @param Sitemap $sitemapIndex  Index collecting the generated files.
     * @param int     $sitemapNumber Zero-based number used in the file name.
     * @param int     $urlCount      Number of URLs in $sitemap (for the log line).
     *
     * @return void
     */
    private function flushSitemap($sitemap, $sitemapIndex, $sitemapNumber, $urlCount)
    {
        $filename = "sitemap-products-{$sitemapNumber}.xml";
        $this->saveSitemap($sitemap, $filename);
        $sitemapIndex->addSitemap(
            "{$this->baseUrl}/{$filename}",
            date('c')
        );
        echo "Generated {$filename} with {$urlCount} URLs\n";
    }

    /**
     * Renders a sitemap to XML and writes it into the output directory.
     *
     * @throws RuntimeException If the file cannot be written.
     */
    private function saveSitemap($sitemap, $filename)
    {
        $xml = $sitemap->renderXml();
        if (file_put_contents($this->outputDir . $filename, $xml) === false) {
            throw new RuntimeException("Unable to write sitemap file: {$this->outputDir}{$filename}");
        }
    }

    /**
     * Renders the sitemap index through the 'sitemap.sitemapindex' view and writes it.
     *
     * NOTE(review): view() is not defined in this file — presumably the Laravel
     * helper; confirm it is available where this class runs.
     *
     * @throws RuntimeException If the file cannot be written.
     */
    private function generateSitemapIndex($sitemapIndex, $filename)
    {
        $items = $sitemapIndex->getModel()->getSitemaps();
        $xml = view('sitemap.sitemapindex', compact('items'))->render();
        if (file_put_contents($this->outputDir . $filename, $xml) === false) {
            throw new RuntimeException("Unable to write sitemap index: {$this->outputDir}{$filename}");
        }
    }
}
// Usage: build the generator from a plain config array, then write every
// product sitemap file and the product sitemap index.
$config = [
'base_url' => 'https://example.com',
// Directory the XML files are written to; the constructor creates it if missing.
'output_dir' => '/var/www/html/public/sitemaps/',
// Connection settings used to build the PDO MySQL DSN and credentials.
'database' => [
'host' => 'localhost',
'name' => 'yourdb',
'user' => 'dbuser',
'pass' => 'dbpass'
]
];
$generator = new LargeScaleSitemapGenerator(
$config['base_url'],
$config['output_dir'],
$config['database']
);
// Runs the full generation; progress is echoed to standard output.
$generator->generateLargeProductSitemaps();
<?php
use Rumenx\Sitemap\Sitemap;
/**
 * Generates sitemaps for several content tables (posts, products, categories,
 * pages) and a master index (sitemap.xml) that lists every generated file.
 *
 * Each table is written as numbered files (sitemap-<table>-N.xml) of at most
 * $chunkSize URLs. When a table needs more than one file, a per-table index
 * (sitemap-<table>-index.xml) is written as well.
 */
class MultiTableSitemapGenerator
{
    private $baseUrl;
    private $outputDir;
    private $pdo;
    private $chunkSize = 45000; // Leave room for other URLs

    /**
     * @param string $baseUrl   Site base URL; a trailing slash is stripped.
     * @param string $outputDir Directory the XML files are written to (created if missing).
     * @param array  $dbConfig  Keys: host, name, user, pass.
     *
     * @throws RuntimeException If the output directory cannot be created.
     * @throws PDOException     If the database connection fails.
     */
    public function __construct($baseUrl, $outputDir, $dbConfig)
    {
        $this->baseUrl = rtrim($baseUrl, '/');
        $this->outputDir = rtrim($outputDir, '/') . '/';
        $this->pdo = new PDO(
            "mysql:host={$dbConfig['host']};dbname={$dbConfig['name']}",
            $dbConfig['user'],
            $dbConfig['pass'],
            [
                PDO::MYSQL_ATTR_USE_BUFFERED_QUERY => false,
                PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION,
            ]
        );

        if (!is_dir($this->outputDir)
            && !mkdir($this->outputDir, 0755, true)
            && !is_dir($this->outputDir)
        ) {
            throw new RuntimeException("Unable to create output directory: {$this->outputDir}");
        }
    }

    /**
     * Generates the sitemaps of every content type and the master index.
     *
     * The master index lists the individual sitemap files directly: the
     * sitemap protocol does not allow an index file to reference other
     * index files.
     *
     * @return void
     */
    public function generateAllSitemaps()
    {
        $masterIndex = new Sitemap();

        // Table name => URL prefix used for that content type.
        $contentTypes = [
            'posts' => 'blog',
            'products' => 'products',
            'categories' => 'categories',
            'pages' => 'pages',
        ];

        foreach ($contentTypes as $table => $urlPrefix) {
            foreach ($this->generateContentSitemaps($table, $urlPrefix) as $sitemapFile) {
                $masterIndex->addSitemap(
                    "{$this->baseUrl}/{$sitemapFile}",
                    date('c')
                );
            }
        }

        // Generate master sitemap index
        $this->generateSitemapIndex($masterIndex, 'sitemap.xml');
        echo "Master sitemap index generated: sitemap.xml\n";
    }

    /**
     * Writes the sitemap files for one table.
     *
     * @param string $table     Table to read (also used in the file names).
     * @param string $urlPrefix Path segment placed between base URL and slug.
     *
     * @return string[] File names of the generated sitemaps; empty when the
     *                  table has no matching rows.
     */
    private function generateContentSitemaps($table, $urlPrefix)
    {
        echo "Generating sitemaps for {$table}...\n";

        // Get total count. Cast to int because PDO may return the count as a
        // string, which would never be identical to the integer 0.
        $whereClause = $this->getWhereClause($table);
        $countStmt = $this->pdo->query("SELECT COUNT(*) as total FROM {$table} WHERE {$whereClause}");
        $totalItems = (int) $countStmt->fetchColumn();
        // Release the cursor so the unbuffered connection can run the next statement.
        $countStmt->closeCursor();
        $countStmt = null;

        if ($totalItems === 0) {
            echo "No items found for {$table}\n";
            return [];
        }
        echo "Total {$table}: {$totalItems}\n";

        $generatedFiles = [];
        $urlCounter = 0;
        $contentIndex = new Sitemap();
        $currentSitemap = new Sitemap();
        $frequency = $this->getFrequencyForTable($table); // Same for every row of the table
        $limit = 1000;

        $stmt = $this->pdo->prepare("
            SELECT slug, updated_at, priority
            FROM {$table}
            WHERE {$whereClause}
            ORDER BY id
            LIMIT :limit OFFSET :offset
        ");

        for ($offset = 0; $offset < $totalItems; $offset += $limit) {
            $stmt->bindValue(':limit', $limit, PDO::PARAM_INT);
            $stmt->bindValue(':offset', $offset, PDO::PARAM_INT);
            $stmt->execute();

            while ($item = $stmt->fetch(PDO::FETCH_ASSOC)) {
                if ($urlCounter >= $this->chunkSize) {
                    // Current file is full: persist it and start a new one.
                    $generatedFiles[] = $this->flushSitemap(
                        $currentSitemap,
                        $contentIndex,
                        $table,
                        count($generatedFiles),
                        $urlCounter
                    );
                    $currentSitemap = new Sitemap();
                    $urlCounter = 0;
                }

                $currentSitemap->add(
                    "{$this->baseUrl}/{$urlPrefix}/{$item['slug']}",
                    date('c', strtotime($item['updated_at'])),
                    $this->getPriorityForTable($table, $item),
                    $frequency
                );
                $urlCounter++;
            }

            $stmt->closeCursor();

            // Memory management
            if (($offset + $limit) % 50000 === 0) {
                gc_collect_cycles();
                echo "Memory usage: " . memory_get_usage(true) / 1024 / 1024 . " MB\n";
            }
        }

        // Handle remaining URLs
        if ($urlCounter > 0) {
            $generatedFiles[] = $this->flushSitemap(
                $currentSitemap,
                $contentIndex,
                $table,
                count($generatedFiles),
                $urlCounter
            );
        }

        // Generate content type index if multiple files
        if (count($generatedFiles) > 1) {
            $indexFilename = "sitemap-{$table}-index.xml";
            $this->generateSitemapIndex($contentIndex, $indexFilename);
            echo "Generated index for {$table}: {$indexFilename}\n";
        }

        return $generatedFiles;
    }

    /**
     * Saves one finished sitemap, registers it in the table index and reports it.
     *
     * @return string The file name that was written.
     */
    private function flushSitemap($sitemap, $contentIndex, $table, $sitemapNumber, $urlCount)
    {
        $filename = "sitemap-{$table}-{$sitemapNumber}.xml";
        $this->saveSitemap($sitemap, $filename);
        $contentIndex->addSitemap(
            "{$this->baseUrl}/{$filename}",
            date('c')
        );
        echo "Generated {$filename} with {$urlCount} URLs\n";

        return $filename;
    }

    /**
     * Returns the SQL condition that selects the publicly visible rows of a table.
     */
    private function getWhereClause($table)
    {
        $clauses = [
            'posts' => 'published = 1',
            'products' => 'active = 1',
            'categories' => 'active = 1',
            'pages' => 'published = 1',
        ];

        return isset($clauses[$table]) ? $clauses[$table] : '1=1';
    }

    /**
     * Returns the row's own priority when set, otherwise the table default.
     */
    private function getPriorityForTable($table, $item)
    {
        if (isset($item['priority'])) {
            return $item['priority'];
        }

        $defaults = [
            'posts' => '0.7',
            'products' => '0.8',
            'categories' => '0.6',
            'pages' => '0.8',
        ];

        return isset($defaults[$table]) ? $defaults[$table] : '0.5';
    }

    /**
     * Returns the change frequency advertised for a table's URLs.
     */
    private function getFrequencyForTable($table)
    {
        return $table === 'products' ? 'weekly' : 'monthly';
    }

    /**
     * Renders a sitemap to XML and writes it into the output directory.
     *
     * @throws RuntimeException If the file cannot be written.
     */
    private function saveSitemap($sitemap, $filename)
    {
        $xml = $sitemap->renderXml();
        if (file_put_contents($this->outputDir . $filename, $xml) === false) {
            throw new RuntimeException("Unable to write sitemap file: {$this->outputDir}{$filename}");
        }
    }

    /**
     * Renders a sitemap index through the 'sitemap.sitemapindex' view and writes it.
     *
     * NOTE(review): view() is not defined in this file — presumably the Laravel
     * helper; confirm it is available where this class runs.
     *
     * @throws RuntimeException If the file cannot be written.
     */
    private function generateSitemapIndex($sitemapIndex, $filename)
    {
        $items = $sitemapIndex->getModel()->getSitemaps();
        $xml = view('sitemap.sitemapindex', compact('items'))->render();
        if (file_put_contents($this->outputDir . $filename, $xml) === false) {
            throw new RuntimeException("Unable to write sitemap index: {$this->outputDir}{$filename}");
        }
    }
}
<?php
use Rumenx\Sitemap\Sitemap;
/**
 * Generates the sitemaps of one table by streaming its rows through a
 * generator, so only one sitemap-sized chunk of rows is held at a time.
 */
class StreamingSitemapGenerator
{
    private $baseUrl;
    private $outputDir;
    private $pdo;

    /**
     * @param string $baseUrl   Site base URL; a trailing slash is stripped.
     * @param string $outputDir Directory the XML files are written to (created if missing).
     * @param array  $dbConfig  Keys: host, name, user, pass.
     *
     * @throws RuntimeException If the output directory cannot be created.
     */
    public function __construct($baseUrl, $outputDir, $dbConfig)
    {
        $this->baseUrl = rtrim($baseUrl, '/');
        $this->outputDir = rtrim($outputDir, '/') . '/';
        $this->pdo = new PDO(
            "mysql:host={$dbConfig['host']};dbname={$dbConfig['name']}",
            $dbConfig['user'],
            $dbConfig['pass'],
            [PDO::MYSQL_ATTR_USE_BUFFERED_QUERY => false]
        );

        if (!is_dir($this->outputDir)
            && !mkdir($this->outputDir, 0755, true)
            && !is_dir($this->outputDir)
        ) {
            throw new RuntimeException("Unable to create output directory: {$this->outputDir}");
        }
    }

    /**
     * Writes one sitemap file per streamed chunk, then an index of all files.
     *
     * @param string $table     Table to read (also used in the file names).
     * @param string $urlPrefix Path segment placed between base URL and slug.
     *
     * @return void
     *
     * @throws RuntimeException If a file cannot be written.
     */
    public function generateStreamingSitemap($table, $urlPrefix)
    {
        $sitemapIndex = new Sitemap();
        $sitemapCounter = 0;

        foreach ($this->getContentStream($table) as $chunk) {
            if (empty($chunk)) {
                continue;
            }

            $sitemap = new Sitemap();
            foreach ($chunk as $item) {
                $sitemap->add(
                    "{$this->baseUrl}/{$urlPrefix}/{$item['slug']}",
                    date('c', strtotime($item['updated_at'])),
                    $item['priority'] ?? '0.7',
                    'monthly'
                );
            }

            $filename = "sitemap-{$table}-{$sitemapCounter}.xml";
            $xml = $sitemap->renderXml();
            if (file_put_contents($this->outputDir . $filename, $xml) === false) {
                throw new RuntimeException("Unable to write sitemap file: {$this->outputDir}{$filename}");
            }

            $sitemapIndex->addSitemap(
                "{$this->baseUrl}/{$filename}",
                date('c')
            );
            echo "Generated {$filename} with " . count($chunk) . " URLs\n";

            // Clear memory before the next chunk is pulled from the stream.
            unset($sitemap, $xml, $chunk);
            gc_collect_cycles();
            $sitemapCounter++;
        }

        // Generate index
        if ($sitemapCounter > 0) {
            $indexFilename = "sitemap-{$table}-index.xml";
            $this->generateSitemapIndex($sitemapIndex, $indexFilename);
            echo "Generated {$indexFilename}\n";
        }
    }

    /**
     * Yields the table's rows in lists of up to $chunkSize rows.
     *
     * Rows are fetched from the database 1000 at a time and accumulated until
     * a full chunk is available; the last chunk may be shorter.
     *
     * @param string $table     Table to read.
     * @param int    $chunkSize Rows per yielded chunk; values below 1 are treated as 1.
     *
     * @return Generator Yields arrays of rows with keys slug, updated_at, priority.
     */
    private function getContentStream($table, $chunkSize = 50000)
    {
        // A chunk size of 0 or less would make the yield loop below spin forever.
        $chunkSize = max(1, (int) $chunkSize);
        $offset = 0;
        $batchSize = 1000;

        // Plain local buffer: a `static` variable here would survive between
        // calls and leak leftover rows of one run into the next.
        $pending = [];

        $stmt = $this->pdo->prepare("
            SELECT slug, updated_at, priority
            FROM {$table}
            WHERE " . $this->getWhereClause($table) . "
            ORDER BY id
            LIMIT :batch_size OFFSET :offset
        ");

        while (true) {
            $stmt->bindValue(':batch_size', $batchSize, PDO::PARAM_INT);
            $stmt->bindValue(':offset', $offset, PDO::PARAM_INT);
            $stmt->execute();
            $batch = $stmt->fetchAll(PDO::FETCH_ASSOC);

            if (empty($batch)) {
                break; // No more data
            }

            $pending = array_merge($pending, $batch);

            // Yield chunks of specified size
            while (count($pending) >= $chunkSize) {
                yield array_splice($pending, 0, $chunkSize);
            }

            $offset += $batchSize;
        }

        // Yield remaining items
        if (!empty($pending)) {
            yield $pending;
        }
    }

    /**
     * Returns the SQL condition that selects the publicly visible rows of a table.
     */
    private function getWhereClause($table)
    {
        switch ($table) {
            case 'posts':
                return 'published = 1';
            case 'products':
                return 'active = 1';
            default:
                return '1=1';
        }
    }

    /**
     * Renders the sitemap index through the 'sitemap.sitemapindex' view and writes it.
     *
     * NOTE(review): view() is not defined in this file — presumably the Laravel
     * helper; confirm it is available where this class runs.
     *
     * @throws RuntimeException If the file cannot be written.
     */
    private function generateSitemapIndex($sitemapIndex, $filename)
    {
        $items = $sitemapIndex->getModel()->getSitemaps();
        $xml = view('sitemap.sitemapindex', compact('items'))->render();
        if (file_put_contents($this->outputDir . $filename, $xml) === false) {
            throw new RuntimeException("Unable to write sitemap index: {$this->outputDir}{$filename}");
        }
    }
}
// Usage: stream the "products" table into sitemap files under the "products" URL prefix.
// NOTE(review): $baseUrl, $outputDir and $dbConfig are not defined in this snippet —
// assumes the surrounding script sets them before this point.
$generator = new StreamingSitemapGenerator($baseUrl, $outputDir, $dbConfig);
$generator->generateStreamingSitemap('products', 'products');
<?php
use Rumenx\Sitemap\Sitemap;
/**
 * Generates the sitemaps of one table with several worker processes.
 *
 * The table is split into row ranges; each range is handed to a
 * generate-sitemap-chunk.php child process that writes
 * sitemap-<table>-chunk-<N>.xml. At most $maxProcesses children run at the
 * same time and no range exceeds $maxUrlsPerFile rows. Afterwards all chunk
 * files are combined into sitemap-<table>-index.xml.
 */
class ParallelSitemapGenerator
{
    private $baseUrl;
    private $outputDir;
    private $dbConfig;
    private $maxProcesses = 4;
    private $maxUrlsPerFile = 50000; // Sitemap protocol limit per file

    /**
     * @param string $baseUrl   Site base URL; a trailing slash is stripped.
     * @param string $outputDir Directory the XML files are written to (created if missing).
     * @param array  $dbConfig  Keys: host, name, user, pass.
     *
     * @throws RuntimeException If the output directory cannot be created.
     */
    public function __construct($baseUrl, $outputDir, $dbConfig)
    {
        $this->baseUrl = rtrim($baseUrl, '/');
        $this->outputDir = rtrim($outputDir, '/') . '/';
        $this->dbConfig = $dbConfig;

        if (!is_dir($this->outputDir)
            && !mkdir($this->outputDir, 0755, true)
            && !is_dir($this->outputDir)
        ) {
            throw new RuntimeException("Unable to create output directory: {$this->outputDir}");
        }
    }

    /**
     * Splits the table into ranges, runs the workers and builds the index.
     *
     * @param string $table     Table to read (also used in the file names).
     * @param string $urlPrefix Path segment placed between base URL and slug.
     *
     * @return void
     */
    public function generateParallelSitemaps($table, $urlPrefix)
    {
        // Get total count and calculate ranges
        $pdo = new PDO(
            "mysql:host={$this->dbConfig['host']};dbname={$this->dbConfig['name']}",
            $this->dbConfig['user'],
            $this->dbConfig['pass'],
            [PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION]
        );
        $stmt = $pdo->query("SELECT COUNT(*) as total FROM {$table} WHERE active = 1");
        $total = (int) $stmt->fetchColumn();
        $stmt = null;
        $pdo = null; // The parent needs no connection while the workers run.

        // Remove chunk files of earlier runs so they cannot end up in the new index.
        $stale = glob($this->outputDir . "sitemap-{$table}-chunk-*.xml");
        foreach ($stale ?: [] as $staleFile) {
            unlink($staleFile);
        }

        // Spread the rows over the workers, but never exceed the per-file URL limit.
        $chunkSize = (int) min($this->maxUrlsPerFile, ceil($total / $this->maxProcesses));
        $chunkCount = $chunkSize > 0 ? (int) ceil($total / $chunkSize) : 0;

        echo "Generating {$table} sitemaps in {$this->maxProcesses} parallel processes...\n";
        echo "Total items: {$total}, chunk size: {$chunkSize}\n";

        $failures = 0;

        // Run the chunks in waves of at most $maxProcesses concurrent children.
        for ($waveStart = 0; $waveStart < $chunkCount; $waveStart += $this->maxProcesses) {
            $waveEnd = min($waveStart + $this->maxProcesses, $chunkCount);
            $processes = [];

            // Start processes
            for ($i = $waveStart; $i < $waveEnd; $i++) {
                $offset = $i * $chunkSize;
                $limit = min($chunkSize, $total - $offset);

                $cmd = sprintf(
                    'php %s --table=%s --url-prefix=%s --offset=%d --limit=%d --process=%d',
                    escapeshellarg(__DIR__ . '/generate-sitemap-chunk.php'),
                    escapeshellarg($table),
                    escapeshellarg($urlPrefix),
                    $offset,
                    $limit,
                    $i
                );

                $process = proc_open($cmd, [], $pipes);
                if (!is_resource($process)) {
                    echo "Failed to start process {$i}\n";
                    $failures++;
                    continue;
                }

                $processes[$i] = $process;
                echo "Started process {$i}: offset {$offset}, limit {$limit}\n";
            }

            // Wait for all processes of this wave to complete
            foreach ($processes as $i => $process) {
                $status = proc_close($process);
                echo "Process {$i} completed with status {$status}\n";
                if ($status !== 0) {
                    $failures++;
                }
            }
        }

        if ($failures > 0) {
            echo "Warning: {$failures} chunk process(es) failed; the index may be incomplete\n";
        }

        // Combine results into index
        $this->createCombinedIndex($table);
    }

    /**
     * Builds the index from every chunk file found in the output directory.
     */
    private function createCombinedIndex($table)
    {
        $sitemapIndex = new Sitemap();

        // Find all generated chunk files; glob() returns false on error.
        $pattern = $this->outputDir . "sitemap-{$table}-chunk-*.xml";
        $files = glob($pattern) ?: [];
        // Natural order keeps chunk-10 after chunk-9.
        sort($files, SORT_NATURAL);

        foreach ($files as $file) {
            $filename = basename($file);
            $sitemapIndex->addSitemap(
                "{$this->baseUrl}/{$filename}",
                date('c', filemtime($file))
            );
        }

        // Generate index
        $indexFilename = "sitemap-{$table}-index.xml";
        $this->generateSitemapIndex($sitemapIndex, $indexFilename);
        echo "Generated combined index: {$indexFilename}\n";
    }

    /**
     * Renders the sitemap index through the 'sitemap.sitemapindex' view and writes it.
     *
     * NOTE(review): view() is not defined in this file — presumably the Laravel
     * helper; confirm it is available where this class runs.
     *
     * @throws RuntimeException If the file cannot be written.
     */
    private function generateSitemapIndex($sitemapIndex, $filename)
    {
        $items = $sitemapIndex->getModel()->getSitemaps();
        $xml = view('sitemap.sitemapindex', compact('items'))->render();
        if (file_put_contents($this->outputDir . $filename, $xml) === false) {
            throw new RuntimeException("Unable to write sitemap index: {$this->outputDir}{$filename}");
        }
    }
}
#!/usr/bin/env php
<?php
/**
* Generate sitemap chunk for parallel processing
*/
require 'vendor/autoload.php';
use Rumenx\Sitemap\Sitemap;
// Parse command line arguments
$options = getopt('', [
    'table:',
    'url-prefix:',
    'offset:',
    'limit:',
    'process:'
]);

// Every option is required and must be given exactly once
// (getopt() returns an array for repeated options).
foreach (['table', 'url-prefix', 'offset', 'limit', 'process'] as $requiredOption) {
    if (!isset($options[$requiredOption]) || !is_string($options[$requiredOption])) {
        fwrite(STDERR, "Missing or repeated required option --{$requiredOption}\n");
        exit(2);
    }
}

$table = $options['table'];
$urlPrefix = $options['url-prefix'];
$offset = (int)$options['offset'];
$limit = (int)$options['limit'];
$processId = (int)$options['process'];

// The table name is interpolated into the SQL below (identifiers cannot be
// bound as parameters), so only plain identifiers are accepted.
if (!preg_match('/^[A-Za-z_][A-Za-z0-9_]*$/', $table)) {
    fwrite(STDERR, "Invalid table name: {$table}\n");
    exit(2);
}

// Database configuration (you might want to load this from config)
$dbConfig = [
    'host' => 'localhost',
    'name' => 'yourdb',
    'user' => 'dbuser',
    'pass' => 'dbpass'
];
$baseUrl = 'https://example.com';
$outputDir = '/path/to/output/';

try {
    $pdo = new PDO(
        "mysql:host={$dbConfig['host']};dbname={$dbConfig['name']}",
        $dbConfig['user'],
        $dbConfig['pass'],
        // Make SQL failures throw so they reach the catch block below.
        [PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION]
    );

    $sitemap = new Sitemap();

    // Read only the row range assigned to this process.
    $stmt = $pdo->prepare("
        SELECT slug, updated_at
        FROM {$table}
        WHERE active = 1
        ORDER BY id
        LIMIT :limit OFFSET :offset
    ");
    $stmt->bindValue(':limit', $limit, PDO::PARAM_INT);
    $stmt->bindValue(':offset', $offset, PDO::PARAM_INT);
    $stmt->execute();

    $count = 0;
    while ($item = $stmt->fetch(PDO::FETCH_ASSOC)) {
        $sitemap->add(
            "{$baseUrl}/{$urlPrefix}/{$item['slug']}",
            date('c', strtotime($item['updated_at'])),
            '0.8',
            'weekly'
        );
        $count++;
    }

    // Save chunk file
    $filename = "sitemap-{$table}-chunk-{$processId}.xml";
    $xml = $sitemap->renderXml();
    if (file_put_contents($outputDir . $filename, $xml) === false) {
        throw new RuntimeException("Unable to write {$outputDir}{$filename}");
    }

    echo "Process {$processId}: Generated {$filename} with {$count} URLs\n";
} catch (Exception $e) {
    // A non-zero exit status tells the parent process that this chunk failed.
    echo "Process {$processId} error: " . $e->getMessage() . "\n";
    exit(1);
}
<?php
use Rumenx\Sitemap\Sitemap;
/**
 * Generates the sitemaps of one table, starting a new file whenever memory
 * usage crosses $maxMemoryMB or the current file reaches $maxUrlsPerFile URLs.
 *
 * Files are written relative to the current working directory.
 */
class MemoryOptimizedGenerator
{
    private $maxMemoryMB = 128; // Maximum memory usage in MB
    private $checkInterval = 1000; // Check memory every N URLs
    private $maxUrlsPerFile = 50000; // Sitemap protocol limit per file
    private $itemCount = 0;

    /**
     * Writes sitemap-<table>-N.xml files and sitemap-<table>-index.xml.
     *
     * @param string   $table Table to read (also used in URLs and file names).
     * @param PDO|null $pdo   Your DB connection. Required: the class has no
     *                        connection settings of its own.
     *
     * @return void
     *
     * @throws InvalidArgumentException If no PDO connection is supplied.
     * @throws RuntimeException         If a file cannot be written.
     */
    public function generateWithMemoryLimit($table, $pdo = null)
    {
        if (!$pdo instanceof PDO) {
            throw new InvalidArgumentException('generateWithMemoryLimit() requires a PDO connection');
        }

        $sitemap = new Sitemap();
        $sitemapIndex = new Sitemap();
        $sitemapCounter = 0;
        $urlsInCurrentFile = 0;

        $stmt = $pdo->prepare("SELECT slug, updated_at FROM {$table} WHERE active = 1");
        $stmt->execute();

        while ($item = $stmt->fetch(PDO::FETCH_ASSOC)) {
            $sitemap->add(
                "https://example.com/{$table}/{$item['slug']}",
                date('c', strtotime($item['updated_at'])),
                '0.7',
                'monthly'
            );
            $this->itemCount++;
            $urlsInCurrentFile++;

            // A file may never hold more URLs than the protocol allows.
            $flushReason = $urlsInCurrentFile >= $this->maxUrlsPerFile ? 'URL limit' : null;

            // Check memory usage periodically
            if ($this->itemCount % $this->checkInterval === 0) {
                $memoryMB = memory_get_usage(true) / 1024 / 1024;
                echo "Memory usage: {$memoryMB} MB (items: {$this->itemCount})\n";
                if ($memoryMB > $this->maxMemoryMB) {
                    $flushReason = 'memory limit';
                }
            }

            if ($flushReason !== null) {
                // Save current sitemap and start fresh
                $filename = "sitemap-{$table}-{$sitemapCounter}.xml";
                $this->writeSitemap($sitemap, $sitemapIndex, $filename);
                echo "Saved {$filename} due to {$flushReason}\n";

                // Clean up
                unset($sitemap);
                gc_collect_cycles();

                // Start new sitemap
                $sitemap = new Sitemap();
                $urlsInCurrentFile = 0;
                $sitemapCounter++;
                echo "Memory after cleanup: " . (memory_get_usage(true) / 1024 / 1024) . " MB\n";
            }
        }

        // Save final sitemap, but only if it holds URLs that were not flushed yet.
        if ($urlsInCurrentFile > 0) {
            $filename = "sitemap-{$table}-{$sitemapCounter}.xml";
            $this->writeSitemap($sitemap, $sitemapIndex, $filename);
            echo "Saved final {$filename}\n";
        }

        // Generate index
        $this->generateSitemapIndex($sitemapIndex, "sitemap-{$table}-index.xml");
    }

    /**
     * Renders a sitemap to disk and registers the file in the index.
     *
     * @throws RuntimeException If the file cannot be written.
     */
    private function writeSitemap($sitemap, $sitemapIndex, $filename)
    {
        $xml = $sitemap->renderXml();
        if (file_put_contents($filename, $xml) === false) {
            throw new RuntimeException("Unable to write sitemap file: {$filename}");
        }
        $sitemapIndex->addSitemap("https://example.com/{$filename}", date('c'));
    }

    /**
     * Renders the sitemap index through the 'sitemap.sitemapindex' view and writes it.
     *
     * NOTE(review): view() is not defined in this file — presumably the Laravel
     * helper; confirm it is available where this class runs.
     *
     * @throws RuntimeException If the file cannot be written.
     */
    private function generateSitemapIndex($sitemapIndex, $filename)
    {
        $items = $sitemapIndex->getModel()->getSitemaps();
        $xml = view('sitemap.sitemapindex', compact('items'))->render();
        if (file_put_contents($filename, $xml) === false) {
            throw new RuntimeException("Unable to write sitemap index: {$filename}");
        }
    }
}
-
Database Optimization
- Use proper indexes on frequently queried columns
- Consider read replicas for large datasets
- Use `LIMIT` and `OFFSET` for pagination
- Avoid `SELECT *` - only fetch needed columns
-
Memory Management
- Use unbuffered queries: `PDO::MYSQL_ATTR_USE_BUFFERED_QUERY => false`
- Call `gc_collect_cycles()` periodically
- Unset large variables when done
- Monitor memory usage with `memory_get_usage()`
-
File I/O Optimization
- Write files in chunks
- Use efficient file paths
- Consider using streams for very large files
- Implement proper error handling
-
Scaling Strategies
- Use queue systems for background processing
- Implement intelligent caching
- Consider cloud storage for sitemap files
- Use CDN for sitemap delivery
- Explore Memory Optimization for detailed memory management
- Check Automated Generation for scheduling strategies
- See Caching Strategies for performance optimization
- Learn about Framework Integration for Laravel/Symfony patterns