# Default group: applies to every crawler that is not matched by a more specific User-agent group in this file.
# Only /rss_feeds is blocked for these crawlers; everything else may be crawled.
User-agent: *
Disallow: /rss_feeds
# =========================
# Explicitly Allowed Agents
# =========================
# OpenAI Search Bot is explicitly allowed to access all content.
# Because a crawler follows only the group that matches it most specifically, this group
# replaces the "User-agent: *" rules for OAI-SearchBot, so /rss_feeds is open to it as well.
User-agent: OAI-SearchBot
Allow: /
# =========================
# AI / LLM-Related Crawlers (Disallowed)
# =========================
# Common Crawl robot, the resulting dataset is often used for LLM training.
User-agent: CCBot
Disallow: /
# ChatGPT robot, used to improve ChatGPT LLM.
# NOTE(review): OpenAI describes ChatGPT-User as fetching pages on behalf of user requests
# rather than crawling for model training — confirm that blocking it is intended.
User-agent: ChatGPT-User
Disallow: /
# ChatGPT/2.0 robot agent replacing version 1.0, used to improve ChatGPT LLM.
# NOTE(review): robots.txt matching is normally done on the product token without a version
# suffix, so this group is likely already covered by the ChatGPT-User group above — confirm before removing.
User-agent: ChatGPT-User/2.0
Disallow: /
# ChatGPT robot, may be used to improve ChatGPT LLM.
User-agent: GPTBot
Disallow: /
# Robot used to improve Bard and Vertex AI LLMs.
User-agent: Google-Extended
Disallow: /
# Used by webz.io; their datasets are frequently used to train LLMs.
User-agent: omgili
Disallow: /
# Used by webz.io; their datasets are frequently used to train LLMs.
User-agent: omgilibot
Disallow: /
# FacebookBot crawls public web pages, which can feed Facebook’s LLM efforts.
# Note: Official Meta crawler that generates preview snippets and link metadata for Facebook and Instagram.
User-agent: FacebookBot
Disallow: /
# Amazonbot is used to train Amazon services such as Alexa.
User-agent: Amazonbot
Disallow: /
# Bytespider is ByteDance's bot (TikTok); may not respect robots.txt but is known for AI/ML data gathering.
User-agent: Bytespider
Disallow: /
# Robot used to improve Anthropic AI LLMs.
User-agent: anthropic-ai
Disallow: /
# Additional known AI/LLM bots
User-agent: AI2Bot
Disallow: /
User-agent: Applebot-Extended
Disallow: /
User-agent: Claude-Web
Disallow: /
User-agent: ClaudeBot
Disallow: /
User-agent: cohere-ai
Disallow: /
User-agent: cohere-training-data-crawler
Disallow: /
User-agent: Diffbot
Disallow: /
# NOTE(review): this token contains a space, which is outside the RFC 9309 product-token
# grammar (letters, "_" and "-"); some parsers may read it as just "Kangaroo" — verify against the crawler's documentation.
User-agent: Kangaroo Bot
Disallow: /
User-agent: Meta-ExternalAgent
Disallow: /
User-agent: PanguBot
Disallow: /
User-agent: Timpibot
Disallow: /
User-agent: Webzio-Extended
Disallow: /
User-agent: grapeshot
Disallow: /
#
# Newly added
#
# Ai2Bot-Dolma is operated by Ai2, a non-profit AI research institute. It's used to download data to train open source AI models. TODO Allow or Disallow
User-agent: Ai2Bot-Dolma
Disallow: /
# Crawlspace is a web crawler developed by Crawlspace Inc., designed to systematically browse and index web content for various applications, including data analysis, search engine optimization, and market research. TODO Allow or Disallow
User-agent: Crawlspace
Disallow: /
# FriendlyCrawler is a web crawler designed to index and analyze website content for various purposes, such as improving search engine results or gathering data for analytics. TODO Allow or Disallow
User-agent: FriendlyCrawler
Disallow: /
# The iaskspider crawler bot is an automated web crawler primarily used by the Chinese search engine iAsk to index web content for the engine's database. TODO Allow or Disallow
# The bare product token is listed as well because many parsers match on the token without a
# version suffix and would never match "iaskspider/2.0"; the versioned spelling is kept for
# parsers that compare the line literally.
User-agent: iaskspider
User-agent: iaskspider/2.0
Disallow: /
# ICC-Crawler is NICT's research crawler that automatically collects web pages from the Internet for academic research at Japan's National Institute of Information and Communications Technology. TODO Allow or Disallow
User-agent: ICC-Crawler
Disallow: /
# ImagesiftBot is an intelligence gatherer operated by Hive. TODO Allow or Disallow
User-agent: ImagesiftBot
Disallow: /
# Img2dataset may be crawling your website to collect publicly available images for inclusion in datasets used in machine learning and computer vision projects. TODO Allow or Disallow
User-agent: img2dataset
Disallow: /
# PetalBot is the web crawler for Huawei's Petal Search engine, and its user agent is a string of text that identifies it to websites. TODO Allow or Disallow
User-agent: PetalBot
Disallow: /
# Scrapy is an open-source web crawling framework for Python, used to extract data from websites. It is widely used for web scraping and data mining tasks. TODO Allow or Disallow
User-agent: Scrapy
Disallow: /
# SemrushBot-OCOB is a web crawler operated by Semrush, a company that provides online visibility management and content marketing SaaS platforms. This group matches only the SemrushBot-OCOB token, not the main SemrushBot crawler. TODO Allow or Disallow
User-agent: SemrushBot-OCOB
Disallow: /
# SemrushBot-SWA is a specialized crawler used by Semrush's SEO Writing Assistant tool to check if URLs are accessible and verify content availability for SEO analysis. TODO Allow or Disallow
User-agent: SemrushBot-SWA
Disallow: /
# The Sidetrade crawler bot is an automated tool used by Sidetrade, a company specializing in AI-driven financial solutions. TODO Allow or Disallow
# NOTE(review): this token contains spaces, which is outside the RFC 9309 product-token
# grammar (letters, "_" and "-"); some parsers may read it as just "Sidetrade" — verify against the crawler's documentation.
User-agent: Sidetrade indexer bot
Disallow: /
# VelenPublicWebCrawler is a web crawler operated by Velen, a company that provides cybersecurity solutions. The crawler is used to collect data from publicly accessible websites for analysis and security purposes. TODO Allow or Disallow
User-agent: VelenPublicWebCrawler
Disallow: /
# DuckAssistBot is a web crawler for DuckDuckGo Search that crawls pages in real time for its AI-assisted answers, which prominently cite their sources. TODO Allow or Disallow
# The bare product token is listed as well because many parsers match on the token without a
# version suffix and would never match "DuckAssistBot/1.2"; the versioned spelling is kept for
# parsers that compare the line literally.
User-agent: DuckAssistBot
User-agent: DuckAssistBot/1.2
Disallow: /
# Crawler behind You.com’s AI search and browser assistant, indexing content for real-time answers. TODO Allow or Disallow
User-agent: YouBot
Disallow: /
# Mistral’s real-time citation fetcher for “Le Chat” assistant; respects robots.txt directives. TODO Allow or Disallow
User-agent: MistralAI-User
Disallow: /
# Primary PerplexityBot crawler that indexes sites to build the Perplexity AI search engine. TODO Allow or Disallow
User-agent: PerplexityBot
Disallow: /
# Loads a page only when a user clicks a Perplexity citation; treated as human-like traffic. TODO Allow or Disallow
User-agent: Perplexity-User
Disallow: /
Sitemap: https://www.tv4play.se/sitemap.xml