Sitemap: https://www.philipkingsley.com/media/sitemap/sitemap_us.xml
# ================================================================
# AI CRAWLER WHITELIST CONFIGURATION
# This file allows major LLM crawlers while maintaining SEO access
# ================================================================
# NOTE(review): a crawler obeys only the most specific User-agent group that
# matches it (RFC 9309). Every bot named in this section therefore follows its
# own "Allow: /" and ignores the Disallow rules of the generic "User-agent: *"
# group later in this file (checkout, customer, filtered listings, ...).
# That includes BingBot, which is also Bing's regular search crawler.
# Confirm this exemption is intended; if it is not, the Disallow rules have to
# be repeated inside each group below.
# ——— OPENAI (ChatGPT) ———
# Model training crawler for GPT series
User-agent: GPTBot
Allow: /
# ChatGPT search engine integration
User-agent: OAI-SearchBot
Allow: /
# User-driven browsing from ChatGPT and Custom GPTs
User-agent: ChatGPT-User
Allow: /
# ——— ANTHROPIC (Claude) ———
# Bulk model training for Claude
# NOTE(review): believed to be a legacy token superseded by ClaudeBot — confirm
User-agent: anthropic-ai
Allow: /
# Anthropic's main web crawler
# NOTE(review): Anthropic documents ClaudeBot as its training-data crawler,
# not a chat citation fetcher — confirm against the vendor documentation
User-agent: ClaudeBot
Allow: /
# Claude search integration
User-agent: Claude-SearchBot
Allow: /
# Claude user-driven browsing
User-agent: Claude-User
Allow: /
# ——— GOOGLE (Gemini) ———
# Gemini AI model training and inference
# (a control token honoured by Google's existing crawlers, not a separate crawler)
User-agent: Google-Extended
Allow: /
# Gemini-specific deep research crawler
# NOTE(review): verify this token is published by Google before relying on it
User-agent: Gemini-Deep-Research
Allow: /
# ——— MICROSOFT (Copilot) ———
# Bing crawler used by Microsoft Copilot (also Bing's regular web search crawler)
User-agent: BingBot
Allow: /
# ——— PERPLEXITY AI ———
# Perplexity index builder
User-agent: PerplexityBot
Allow: /
# Human-triggered visits from Perplexity
User-agent: Perplexity-User
Allow: /
# ——— AMAZON (Rufus) ———
# Amazon's crawler for Alexa and Rufus
User-agent: Amazonbot
Allow: /
# ——— META ———
# Meta's external agent for data collection and AI training
User-agent: Meta-ExternalAgent
Allow: /
# Facebook crawler
User-agent: Facebot
Allow: /
# Meta's external fetcher
User-agent: Meta-ExternalFetcher
Allow: /
# ——— BYTEDANCE (TikTok) ———
# ByteDance's web spider for TikTok-related AI models
User-agent: Bytespider
Allow: /
# ——— YOU.COM ———
# You.com AI search bot
User-agent: YouBot
Allow: /
# ——— BRAVE SEARCH ———
# Note: no group is defined for Brave. Its crawler is not addressed by a
# user-agent token of its own here, so control it via IP or Cloudflare rules.
# NOTE(review): Brave reportedly follows the rules that apply to Googlebot
# rather than sending Googlebot's user-agent — confirm against Brave's docs.
# The Bravebot identifier may be visible in logs as a derivative of Googlebot
# (unverified).
# ——— APPLE ———
# Apple's crawler for Siri and general AI development
User-agent: Applebot
Allow: /
# Apple's AI-specific crawling
User-agent: Applebot-Extended
Allow: /
# ——— COMMON CRAWL (CCBot) ———
# Common Crawl foundation - data used by many LLMs
User-agent: CCBot
Allow: /
# ——— xAI (Grok) ———
# NOTE(review): confirm both tokens against xAI's published crawler documentation
User-agent: xAI-Bot
Allow: /
User-agent: GrokBot
Allow: /
# ——— MISTRAL AI ———
# NOTE(review): confirm this token against Mistral's published crawler documentation
User-agent: MistralBot
Allow: /
# ================================================================
# GENERIC BLACKLIST CONFIGURATION
# ================================================================
# Rules for every crawler that is not matched by a more specific
# User-agent group elsewhere in this file.
User-agent: *
# Directories (application internals that must never be crawled)
Disallow: /app/
Disallow: /bin/
Disallow: /dev/
Disallow: /lib/
Disallow: /phpserver/
Disallow: /pkginfo/
Disallow: /report/
Disallow: /setup/
Disallow: /update/
Disallow: /var/
Disallow: /vendor/
# Paths (clean URLs): duplicate-content routes and user-specific pages
Disallow: /index.php/
Disallow: /catalog/product_compare/
Disallow: /catalog/category/view/
Disallow: /catalog/product/view/
Disallow: /catalogsearch/
Disallow: /checkout/
Disallow: /control/
Disallow: /contacts/
Disallow: /customer/
Disallow: /customize/
Disallow: /newsletter/
Disallow: /review/
Disallow: /sendfriend/
Disallow: /wishlist/
# Files (project metadata shipped in the web root)
Disallow: /composer.json
Disallow: /composer.lock
Disallow: /CONTRIBUTING.md
Disallow: /CONTRIBUTOR_LICENSE_AGREEMENT.html
Disallow: /COPYING.txt
Disallow: /Gruntfile.js
Disallow: /LICENSE.txt
Disallow: /LICENSE_AFL.txt
Disallow: /nginx.conf.sample
Disallow: /package.json
Disallow: /php.ini.sample
Disallow: /RELEASE_NOTES.txt
# Do not crawl pages that are sorted or filtered.
# (robots.txt controls crawling, not indexing; "?*" matches the parameter
# at any position in the query string.)
Disallow: /*?*product_list_mode=
Disallow: /*?*product_list_order=
Disallow: /*?*product_list_limit=
Disallow: /*?*product_list_dir=
# Do not crawl URLs carrying a session ID
Disallow: /*?SID=
# Also cover SID when it is not the first query parameter.
Disallow: /*&SID=
# Disallow: /*?
Disallow: /*.php$
# CVS, SVN directory and dump files
# Path matching is case-sensitive, so each pattern is listed both in its
# original capitalised form and in the lowercase form real file names use.
Disallow: /*.CVS
Disallow: /*.cvs
Disallow: /*.Zip$
Disallow: /*.zip$
Disallow: /*.Svn$
Disallow: /*.svn$
Disallow: /*.Idea$
Disallow: /*.idea$
Disallow: /*.Sql$
Disallow: /*.sql$
Disallow: /*.Tgz$
Disallow: /*.tgz$