# robots.txt for https://www.whu.edu/
# ====== THROTTLED BOTS (allowed, but slowed down) ======
# Each group below allows the whole site (empty Disallow) and requests a pause
# between fetches via Crawl-delay (conventionally interpreted as seconds).
# NOTE(review): Crawl-delay is a non-standard directive (not part of RFC 9309);
# every crawler decides for itself whether to honour it.
# NOTE(review): a crawler that matches one of these named groups uses only that
# group, so the "User-agent: *" rules in this file do not apply to it.

# OAI-SearchBot: OpenAI's experimental crawler for AI-powered search features.
User-agent: OAI-SearchBot
Crawl-delay: 15
Disallow:

# GoogleOther: used by Google services such as Lens or Translate; not related
# to the search index.
# NOTE(review): Google documents that its crawlers ignore Crawl-delay, so this
# delay probably has no effect - confirm.
User-agent: GoogleOther
Crawl-delay: 10
Disallow:

# meta-externalagent: fetches previews for Facebook/Instagram; can be quite
# aggressive.
User-agent: meta-externalagent
Crawl-delay: 15
Disallow:
# ====== IMPORTANT BOTS - FULLY ALLOWED ======
# An empty Disallow means "nothing is disallowed" for the named crawler.
# NOTE(review): a crawler that matches one of these named groups uses only that
# group, so the "User-agent: *" restrictions in this file (TYPO3 paths, search
# pages, event query strings) do not apply to it. Confirm this is intended.

# bingbot: Microsoft Bing crawler; important for Copilot, DuckDuckGo etc.,
# but often too aggressive.
User-agent: bingbot
Disallow:

# Googlebot: Google's primary search crawler.
User-agent: Googlebot
Disallow:

# Googlebot-Image: Google image indexing.
User-agent: Googlebot-Image
Disallow:

# AdsBot-Google: checks landing pages for Google Ads.
User-agent: AdsBot-Google
Disallow:

# facebookexternalhit: Facebook preview crawler; needed for correct display of
# shared links.
User-agent: facebookexternalhit
Disallow:

# GPTBot: OpenAI crawler for ChatGPT; relevant for AI visibility.
User-agent: GPTBot
Disallow:

# PerplexityBot: extremely data-hungry crawler for perplexity.ai (AI search).
User-agent: PerplexityBot
Disallow:

# LinkedInApp: LinkedIn app preview; relevant for social sharing.
User-agent: LinkedInApp
Disallow:

# Applebot: used by Siri and Spotlight for indexing.
User-agent: Applebot
Disallow:
# ====== BLOCKED BOTS ======
# Every group in this section denies the whole site with "Disallow: /".

# --- AI / data-harvesting crawlers ---

# ClaudeBot: Anthropic's chatbot crawler (ChatGPT competitor); used for AI
# training.
User-agent: ClaudeBot
Disallow: /

# Bytespider: TikTok/ByteDance crawler for content/ad targeting.
User-agent: Bytespider
Disallow: /

# --- Crawlers with no benefit and high load ---

# MJ12bot (Majestic SEO): outdated, aggressive, no SEO value.
User-agent: MJ12bot
Disallow: /

# BLEXBot: structure crawler without SEO relevance.
User-agent: BLEXBot
Disallow: /

# DotBot: aggressive SEO crawler with no real benefit.
User-agent: DotBot
Disallow: /

# ecoresearchCrawler: academic crawler with heavy resource usage.
User-agent: ecoresearchCrawler
Disallow: /

# Barkrowler: experimental ML/AI crawler; adds load.
User-agent: Barkrowler
Disallow: /

# Quantbot: unknown origin, undocumented purpose.
User-agent: Quantbot
Disallow: /

# Amazonbot: internal Amazon bot (Alexa etc.); not SEO-relevant.
User-agent: Amazonbot
Disallow: /

# YandexBot: Russian search engine bot; not required for global SEO.
User-agent: YandexBot
Disallow: /

# SeznamBot: Czech search engine bot; not relevant.
User-agent: SeznamBot
Disallow: /
# ====== DEFAULT RULES FOR ALL CRAWLERS WITHOUT THEIR OWN GROUP ======
# Covers TYPO3 structure, language-specific search pages and event overview
# pages with query strings.
# All default rules live in ONE "User-agent: *" group. RFC 9309 requires
# parsers to merge repeated groups for the same user agent, but parsers that
# keep only the first "*" group would ignore the rules of a second one.
# This group applies only to crawlers that match no named group in this file.
User-agent: *
# TYPO3: public core/extension assets stay crawlable. The Allow lines come
# first so that parsers evaluating rules in file order reach the same result
# as longest-match parsers. Rules are prefix matches, so no trailing "*" is
# needed.
Allow: /typo3/sysext/core/Resources/Public/
Allow: /typo3conf/ext/*/Resources/Public/
# TYPO3 system and configuration directories.
Disallow: /typo3/
Disallow: /typo3conf/
# Search pages (English and German paths).
Disallow: /en/search/
Disallow: /de/suche/
# Event overview pages with query strings: these URLs generate excessive
# variants and duplicate content for bots.
Disallow: /de/news-insights/events/?
Disallow: /en/news-insights/events/?
# ====== MULTILINGUAL SITEMAP DEFINITIONS ======
# One sitemap per language tree (/de/ and /en/). Sitemap lines are not part of
# any user-agent group and are available to every crawler.
Sitemap: https://www.whu.edu/de/sitemap.xml
Sitemap: https://www.whu.edu/en/sitemap.xml