Sitemap Pro Advanced

Präzise Crawler-Technologie

Gib die vollständige URL mit https:// ein

0
Gefundene URLs
0
HTTP 200
0
Redirects
0
Fehler

Crawler-Log

Bereit zum Crawlen...

Gefundene URLs 0

Keine URLs gefunden

Starte einen Crawl, um Daten zu sammeln

// server.js
//
// HTTP API for a same-domain website crawler and sitemap generator.
// Crawl jobs run in the background inside this process; their state is kept
// in memory (`activeCrawls`) and page results are cached by `node-cache`.

const { randomUUID } = require('crypto');
const { URL } = require('url');

const express = require('express');
const cors = require('cors');
const cheerio = require('cheerio');
const axios = require('axios');
const rateLimit = require('express-rate-limit');
const helmet = require('helmet');
const compression = require('compression');
const NodeCache = require('node-cache');

const app = express();
const cache = new NodeCache({ stdTTL: 3600 }); // cache entries live for 1 hour

// Query parameters that only carry tracking information and are stripped
// during URL normalization so equivalent pages deduplicate.
const TRACKING_PARAMS = [
  'utm_source',
  'utm_medium',
  'utm_campaign',
  'utm_term',
  'utm_content',
  'fbclid',
  'gclid',
  'ref',
  'referrer',
];

// Upper bound for a robots.txt Crawl-delay, so a hostile or mistaken value
// cannot stall a job between every request.
const MAX_CRAWL_DELAY_MS = 30 * 1000;

// How long a finished (or failed) job stays queryable.
const JOB_RETENTION_MS = 3600000;

const HTTP_URL_PATTERN = /^https?:\/\//;

const XML_ENTITIES = {
  '&': '&amp;',
  '<': '&lt;',
  '>': '&gt;',
  '"': '&quot;',
  "'": '&apos;',
};

// Middleware
app.use(helmet());
app.use(compression());
app.use(
  cors({
    origin: process.env.FRONTEND_URL || 'http://localhost:3000',
    credentials: true,
  })
);
app.use(express.json({ limit: '10mb' }));

// Rate limiting
const limiter = rateLimit({
  windowMs: 15 * 60 * 1000, // 15 minutes
  max: 100, // requests per IP per window
});
app.use('/api/', limiter);

// Crawler state: jobId -> job status object
const activeCrawls = new Map();

/**
 * Resolves and canonicalizes a link so equivalent URLs compare equal.
 *
 * Every input (absolute, relative, protocol-relative) goes through the same
 * pipeline: resolve against `baseUrl`, reject non-HTTP(S) schemes, drop
 * tracking parameters and the fragment, and strip one trailing slash from
 * non-root paths. Path and query case are preserved; the URL parser already
 * lowercases scheme and host.
 *
 * @param {string} url - Link as found in the document.
 * @param {string} [baseUrl] - URL of the page containing the link.
 * @returns {string|null} Normalized absolute URL, or null when the link is
 *   empty, unparsable, or not http/https (e.g. mailto:, javascript:).
 */
function normalizeUrl(url, baseUrl) {
  if (typeof url !== 'string') return null;

  const raw = url.trim();
  if (raw === '') return null;

  let parsed;
  try {
    if (baseUrl) {
      parsed = new URL(raw, baseUrl);
    } else if (raw.startsWith('//')) {
      // Protocol-relative link without a base: assume https.
      parsed = new URL(`https:${raw}`);
    } else {
      parsed = new URL(raw);
    }
  } catch {
    return null;
  }

  // HTTP/HTTPS only
  if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') return null;

  for (const param of TRACKING_PARAMS) {
    parsed.searchParams.delete(param);
  }

  parsed.hash = '';

  // Normalize the trailing slash on the path itself (never the root), so a
  // query string ending in "/" is left untouched.
  if (parsed.pathname.length > 1 && parsed.pathname.endsWith('/')) {
    parsed.pathname = parsed.pathname.slice(0, -1);
  }

  return parsed.href;
}

/**
 * Extracts the lowercase hostname of a URL without a leading "www.".
 *
 * @param {string} url
 * @returns {string} Hostname, or '' when the URL cannot be parsed.
 */
function getDomain(url) {
  try {
    return new URL(url).hostname.replace(/^www\./, '').toLowerCase();
  } catch {
    return '';
  }
}

/**
 * @param {string} url
 * @param {string} baseDomain - Value previously produced by getDomain().
 * @returns {boolean} True when `url` is on `baseDomain`.
 */
function isSameDomain(url, baseDomain) {
  return getDomain(url) === baseDomain;
}

/**
 * Parses robots.txt text and collects the rules of the `User-agent: *` groups.
 *
 * Only `Disallow` and `Crawl-delay` are interpreted. Rules belonging to
 * groups for other user agents are ignored. Wildcards (`*`, `$`) inside
 * paths are not expanded; paths are kept verbatim.
 *
 * @param {string} content - Raw robots.txt body.
 * @returns {{allowed: boolean, disallowedPaths: string[], crawlDelay: number}}
 *   `crawlDelay` is in milliseconds.
 */
function parseRobotsTxt(content) {
  const disallowedPaths = [];
  let crawlDelay = 0;
  let inWildcardGroup = false;
  let previousWasUserAgent = false;

  for (const rawLine of content.split(/\r?\n/)) {
    const line = rawLine.replace(/#.*$/, '').trim();
    const separator = line.indexOf(':');
    if (separator === -1) continue;

    const field = line.slice(0, separator).trim().toLowerCase();
    const value = line.slice(separator + 1).trim();

    if (field === 'user-agent') {
      // Consecutive User-agent lines form one group; a User-agent line
      // following a rule starts a new group.
      if (!previousWasUserAgent) inWildcardGroup = false;
      if (value === '*') inWildcardGroup = true;
      previousWasUserAgent = true;
      continue;
    }

    previousWasUserAgent = false;
    if (!inWildcardGroup) continue;

    if (field === 'disallow') {
      // An empty Disallow means "allow everything"; storing '' would match
      // every path through startsWith('').
      if (value !== '') disallowedPaths.push(value);
    } else if (field === 'crawl-delay') {
      const seconds = Number.parseFloat(value);
      if (Number.isFinite(seconds) && seconds > 0) {
        crawlDelay = Math.min(seconds * 1000, MAX_CRAWL_DELAY_MS);
      }
    }
  }

  return { allowed: true, disallowedPaths, crawlDelay };
}

/**
 * Fetches and parses robots.txt for the origin of `baseUrl`.
 *
 * Best effort: any failure (network error, non-2xx status, non-text body)
 * results in "everything allowed, no delay".
 *
 * @param {string} baseUrl
 * @returns {Promise<{allowed: boolean, disallowedPaths?: string[], crawlDelay: number}>}
 */
async function checkRobotsTxt(baseUrl) {
  try {
    const origin = new URL(baseUrl);
    const robotsUrl = `${origin.protocol}//${origin.host}/robots.txt`;
    const response = await axios.get(robotsUrl, { timeout: 5000 });

    if (typeof response.data !== 'string') {
      return { allowed: true, crawlDelay: 0 };
    }
    return parseRobotsTxt(response.data);
  } catch {
    return { allowed: true, crawlDelay: 0 };
  }
}

/**
 * Tells whether a URL's path starts with one of the disallowed prefixes.
 *
 * @param {string} url
 * @param {string[]} [disallowedPaths]
 * @returns {boolean} False when there are no rules or the URL is unparsable.
 */
function isPathDisallowed(url, disallowedPaths = []) {
  if (disallowedPaths.length === 0) return false;

  let pathname;
  try {
    pathname = new URL(url).pathname;
  } catch {
    return false;
  }
  return disallowedPaths.some((prefix) => pathname.startsWith(prefix));
}

/**
 * Collects metadata, links, resources, structured data and headings from a
 * parsed HTML document.
 *
 * @param {Function} $ - Cheerio root returned by cheerio.load().
 * @param {string} pageUrl - URL of the document; base for relative links.
 * @param {string} baseDomain - Domain that counts as "internal".
 * @returns {object} Fields to merge into the page result.
 */
function extractHtmlData($, pageUrl, baseDomain) {
  const metaContent = (selector) => $(selector).attr('content') || '';

  // Anchor links
  const uniqueLinks = new Set();
  const internalLinks = [];
  const externalLinks = [];

  $('a[href]').each((_, el) => {
    const anchor = $(el);
    const normalized = normalizeUrl(anchor.attr('href'), pageUrl);
    if (!normalized) return;

    const isExternal = !isSameDomain(normalized, baseDomain);
    const linkData = {
      url: normalized,
      text: anchor.text().trim().substring(0, 100),
      title: anchor.attr('title') || '',
      isExternal,
      isNofollow: anchor.attr('rel')?.includes('nofollow') || false,
    };

    (isExternal ? externalLinks : internalLinks).push(linkData);
    uniqueLinks.add(normalized);
  });

  // Sub-resources: images, scripts, stylesheets
  const resources = [];

  $('img[src]').each((_, el) => {
    const image = $(el);
    const normalized = normalizeUrl(image.attr('src'), pageUrl);
    if (!normalized) return;
    resources.push({
      type: 'image',
      url: normalized,
      alt: image.attr('alt') || '',
      width: image.attr('width') || '',
      height: image.attr('height') || '',
    });
  });

  $('script[src]').each((_, el) => {
    const normalized = normalizeUrl($(el).attr('src'), pageUrl);
    if (normalized) resources.push({ type: 'script', url: normalized });
  });

  $('link[rel="stylesheet"]').each((_, el) => {
    const normalized = normalizeUrl($(el).attr('href'), pageUrl);
    if (normalized) resources.push({ type: 'css', url: normalized });
  });

  // JSON-LD structured data
  const structuredData = [];
  $('script[type="application/ld+json"]').each((_, el) => {
    try {
      structuredData.push(JSON.parse($(el).html()));
    } catch {
      // Malformed JSON-LD on a third-party page is expected; skip the block.
    }
  });

  // Heading outline
  const headings = {};
  for (const tag of ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']) {
    headings[tag] = $(tag)
      .map((_, el) => $(el).text().trim())
      .get();
  }

  // ''.split(/\s+/) yields [''], so an empty body must be special-cased.
  const bodyText = $('body').text().trim();
  const wordCount = bodyText === '' ? 0 : bodyText.split(/\s+/).length;

  return {
    // Meta data
    title: $('title').text().trim(),
    description: metaContent('meta[name="description"]'),
    canonical: $('link[rel="canonical"]').attr('href') || '',
    robots: metaContent('meta[name="robots"]'),
    viewport: metaContent('meta[name="viewport"]'),
    lang: $('html').attr('lang') || '',
    // Open Graph
    ogTitle: metaContent('meta[property="og:title"]'),
    ogDescription: metaContent('meta[property="og:description"]'),
    ogImage: metaContent('meta[property="og:image"]'),
    links: {
      internal: internalLinks,
      external: externalLinks,
      total: uniqueLinks.size,
    },
    resources,
    structuredData,
    headings,
    wordCount,
  };
}

/**
 * Fetches a single page and, for HTML responses, analyses its content.
 *
 * Successful fetches (any HTTP status) are cached; request failures are not.
 *
 * @param {string} url - Page to fetch.
 * @param {string} baseDomain - Domain that counts as "internal".
 * @param {object} [options]
 * @param {number} [options.timeout=10000] - Request timeout in ms.
 * @param {string} [options.userAgent] - User-Agent header value.
 * @param {boolean} [options.followRedirects=true] - Follow up to 5 redirects.
 * @param {number} [options.maxSize=5242880] - Maximum response size in bytes.
 * @returns {Promise<object>} Page result. On request failure the result has
 *   `status: 0`, `error: true` and `errorType`; this function does not throw.
 */
async function crawlPage(url, baseDomain, options = {}) {
  const {
    timeout = 10000,
    userAgent = 'Mozilla/5.0 (compatible; SitemapBot/1.0)',
    followRedirects = true,
    maxSize = 5 * 1024 * 1024, // 5 MB
  } = options;

  // The internal/external classification stored in the result depends on
  // baseDomain, so it has to be part of the cache key.
  const cacheKey = `crawl_${baseDomain}_${url}`;
  const cached = cache.get(cacheKey);
  if (cached) return cached;

  try {
    const startTime = Date.now();
    const response = await axios.get(url, {
      timeout,
      maxRedirects: followRedirects ? 5 : 0,
      maxContentLength: maxSize,
      headers: {
        'User-Agent': userAgent,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'de-DE,de;q=0.9,en;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
      },
      validateStatus: () => true, // accept every status code; it is reported, not thrown
    });
    const loadTime = Date.now() - startTime;

    const contentType = response.headers['content-type'] || '';
    const isHtml = contentType.includes('text/html');

    const result = {
      url,
      status: response.status,
      statusText: response.statusText,
      headers: response.headers,
      contentType,
      loadTime,
      timestamp: new Date().toISOString(),
      size: response.data?.length || 0,
      isHtml,
      external: !isSameDomain(url, baseDomain),
    };

    if (isHtml && typeof response.data === 'string') {
      Object.assign(result, extractHtmlData(cheerio.load(response.data), url, baseDomain));
    }

    cache.set(cacheKey, result);
    return result;
  } catch (error) {
    return {
      url,
      status: 0,
      statusText: error.message,
      error: true,
      errorType: error.code || 'UNKNOWN',
      external: !isSameDomain(url, baseDomain),
      timestamp: new Date().toISOString(),
    };
  }
}

/**
 * Coerces a caller-supplied option to a non-negative integer.
 *
 * @param {*} value
 * @param {number} fallback - Used when `value` is not a finite number >= 0.
 * @returns {number}
 */
function toNonNegativeInt(value, fallback) {
  const parsed = Number(value);
  return Number.isFinite(parsed) && parsed >= 0 ? Math.trunc(parsed) : fallback;
}

/**
 * Runs a breadth-first crawl of one domain and records progress in
 * `activeCrawls` under `jobId`.
 *
 * The job is registered before any network access so status polling works
 * immediately. On an unexpected error the job is marked `failed` (with
 * `error` set) and the error is rethrown.
 *
 * @param {string} jobId
 * @param {string} startUrl - Absolute http(s) URL to start from.
 * @param {object} [options] - Also forwarded to crawlPage().
 * @param {number} [options.maxUrls=100] - Maximum number of recorded URLs.
 * @param {number} [options.maxDepth=3] - Maximum link depth from the start.
 * @param {number} [options.delay=500] - Pause between requests in ms.
 * @param {boolean} [options.respectRobots=true] - Honor robots.txt rules.
 * @returns {Promise<object>} Final result, also stored on the job.
 */
async function processCrawlJob(jobId, startUrl, options = {}) {
  const maxUrls = toNonNegativeInt(options.maxUrls, 100);
  const maxDepth = toNonNegativeInt(options.maxDepth, 3);
  const delay = toNonNegativeInt(options.delay, 500);
  const respectRobots = options.respectRobots ?? true;

  const startTime = Date.now();
  const updateJob = (patch) => {
    activeCrawls.set(jobId, { ...activeCrawls.get(jobId), ...patch });
  };
  const scheduleCleanup = () => {
    setTimeout(() => activeCrawls.delete(jobId), JOB_RETENTION_MS);
  };

  activeCrawls.set(jobId, {
    status: 'running',
    progress: 0,
    urlsFound: 0,
    currentUrl: startUrl,
    startTime,
  });

  try {
    const baseDomain = getDomain(startUrl);

    // Queue the start URL in the same normalized form as discovered links,
    // otherwise the start page can be crawled a second time.
    const firstUrl = normalizeUrl(startUrl) ?? startUrl;
    const crawled = new Map();
    const queue = [{ url: firstUrl, depth: 0, source: 'start' }];
    const enqueued = new Set([firstUrl]);

    let robotsRules = { allowed: true, crawlDelay: 0 };
    if (respectRobots) {
      robotsRules = await checkRobotsTxt(startUrl);
    }
    // Crawl-delay applies between requests; never go below the site's wish.
    const pause = Math.max(delay, respectRobots ? robotsRules.crawlDelay : 0);

    while (queue.length > 0 && crawled.size < maxUrls) {
      const { url, depth } = queue.shift();

      if (crawled.has(url) || depth > maxDepth) continue;

      // Check robots.txt
      if (respectRobots && isPathDisallowed(url, robotsRules.disallowedPaths)) {
        crawled.set(url, { url, status: 'blocked', reason: 'robots.txt' });
        continue;
      }

      // progress is the share of the maxUrls budget consumed so far.
      updateJob({
        currentUrl: url,
        urlsFound: crawled.size,
        progress: maxUrls > 0 ? Math.min(100, Math.round((crawled.size / maxUrls) * 100)) : 100,
      });

      const result = await crawlPage(url, baseDomain, options);
      crawled.set(url, { ...result, depth });

      // Enqueue newly discovered internal links
      if (result.links && depth < maxDepth) {
        for (const link of result.links.internal) {
          if (enqueued.has(link.url)) continue;
          enqueued.add(link.url);
          queue.push({ url: link.url, depth: depth + 1, source: url });
        }
      }

      if (pause > 0) {
        await new Promise((resolve) => setTimeout(resolve, pause));
      }
    }

    const urls = Array.from(crawled.values());
    const stats = { success: 0, redirects: 0, errors: 0, blocked: 0 };
    for (const { status } of urls) {
      if (status === 'blocked') stats.blocked += 1;
      else if (status === 200) stats.success += 1;
      else if (status >= 300 && status < 400) stats.redirects += 1;
      else if (status >= 400 || status === 0) stats.errors += 1;
    }

    const finalResult = {
      jobId,
      status: 'completed',
      startUrl,
      baseDomain,
      totalUrls: crawled.size,
      duration: Date.now() - startTime,
      urls,
      stats,
    };

    updateJob({
      status: 'completed',
      progress: 100,
      urlsFound: crawled.size,
      result: finalResult,
    });
    scheduleCleanup();

    return finalResult;
  } catch (error) {
    updateJob({ status: 'failed', error: error.message });
    scheduleCleanup();
    throw error;
  }
}

/**
 * Escapes a value for use as XML text or attribute content.
 *
 * @param {*} value
 * @returns {string}
 */
function escapeXml(value) {
  return String(value).replace(/[&<>"']/g, (ch) => XML_ENTITIES[ch]);
}

/**
 * Builds a sitemaps.org XML document.
 *
 * @param {Array<string|{url: string, lastmod?: string, changefreq?: string, priority?: string|number}>} urls
 *   Entries without a string `url` are skipped.
 * @param {string} defaultLastmod - Used when an entry has no `lastmod`.
 * @returns {string} XML document.
 */
function buildSitemapXml(urls, defaultLastmod) {
  const entries = urls
    .map((entry) => (typeof entry === 'string' ? { url: entry } : entry))
    .filter((entry) => typeof entry?.url === 'string' && entry.url !== '')
    .map((entry) =>
      [
        '  <url>',
        `    <loc>${escapeXml(entry.url)}</loc>`,
        `    <lastmod>${escapeXml(entry.lastmod || defaultLastmod)}</lastmod>`,
        `    <changefreq>${escapeXml(entry.changefreq || 'weekly')}</changefreq>`,
        // ?? instead of ||: a priority of 0 is valid and must not become 0.5.
        `    <priority>${escapeXml(entry.priority ?? '0.5')}</priority>`,
        '  </url>',
      ].join('\n')
    );

  return [
    '<?xml version="1.0" encoding="UTF-8"?>',
    '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
    ...entries,
    '</urlset>',
  ].join('\n');
}

/**
 * @param {*} value
 * @returns {boolean} True for strings starting with http:// or https://.
 */
function isHttpUrl(value) {
  return typeof value === 'string' && HTTP_URL_PATTERN.test(value);
}

// API routes
//
// NOTE(security): /api/crawl and /api/check-url fetch caller-supplied URLs
// from this server. Nothing here filters loopback, link-local or private
// network targets; restrict them at the network layer or add a resolver
// check before exposing this service publicly.

// Start a crawl
app.post('/api/crawl', (req, res) => {
  const { url, options } = req.body ?? {};

  if (!isHttpUrl(url)) {
    return res.status(400).json({ error: 'Ungültige URL' });
  }

  const jobId = `crawl_${randomUUID()}`;
  const crawlOptions = options !== null && typeof options === 'object' ? options : {};

  // Runs in the background. The failure is already recorded on the job by
  // processCrawlJob; the catch only prevents an unhandled rejection.
  processCrawlJob(jobId, url, crawlOptions).catch((error) => {
    console.error(`Crawl ${jobId} failed:`, error);
  });

  res.json({ jobId, status: 'started', message: 'Crawl gestartet' });
});

// Query crawl status
app.get('/api/crawl/:jobId/status', (req, res) => {
  const job = activeCrawls.get(req.params.jobId);

  if (!job) {
    return res.status(404).json({ error: 'Job nicht gefunden' });
  }

  res.json(job);
});

// Fetch crawl result
app.get('/api/crawl/:jobId/result', (req, res) => {
  const job = activeCrawls.get(req.params.jobId);

  if (!job) {
    return res.status(404).json({ error: 'Job nicht gefunden' });
  }

  if (job.status === 'failed') {
    return res.status(500).json({ status: job.status, error: job.error });
  }

  if (job.status !== 'completed') {
    return res.status(202).json({ status: job.status, message: 'Noch in Bearbeitung' });
  }

  res.json(job.result);
});

// Check a single URL
app.post('/api/check-url', async (req, res) => {
  const { url, baseUrl } = req.body ?? {};

  if (!isHttpUrl(url)) {
    return res.status(400).json({ error: 'Ungültige URL' });
  }

  const baseDomain = baseUrl ? getDomain(baseUrl) : getDomain(url);

  try {
    const result = await crawlPage(url, baseDomain);
    res.json(result);
  } catch (error) {
    res.status(500).json({ error: error.message });
  }
});

// Generate a sitemap
app.post('/api/sitemap', (req, res) => {
  const { urls } = req.body ?? {};

  if (!Array.isArray(urls)) {
    return res.status(400).json({ error: 'URLs Array erforderlich' });
  }

  const today = new Date().toISOString().split('T')[0];
  const xml = buildSitemapXml(urls, today);

  res.set('Content-Type', 'application/xml');
  res.send(xml);
});

// Health check
app.get('/api/health', (req, res) => {
  res.json({
    status: 'ok',
    activeJobs: activeCrawls.size,
    uptime: process.uptime(),
  });
});

// Error handler (the four-argument signature is what marks it as one for Express)
app.use((err, req, res, next) => {
  console.error(err.stack);
  res.status(500).json({ error: 'Interner Serverfehler' });
});

const PORT = process.env.PORT || 3001;
app.listen(PORT, () => {
  console.log(`🚀 Crawler Proxy läuft auf Port ${PORT}`);
  console.log(`📊 Health Check: http://localhost:${PORT}/api/health`);
});