# robots.txt
# Sitemap locations. Each Sitemap directive must carry the FULL (absolute) URL.
# ${dspace.url} is auto-filled with the value configured in dspace.cfg.
# The XML sitemap is listed first because it is preferred by most search
# engines; the HTML map follows as a fallback.
Sitemap: ${dspace.url}/sitemap
Sitemap: ${dspace.url}/htmlmap

##########################
# Default Access Group
# Applies to every crawler that is not matched by a more specific
# User-agent record further down in this file.
# (NOTE: blank lines are not allowed inside a group record)
##########################
User-agent: *
# Disable access to Discovery search and filters
# (${handle.prefix} is a placeholder filled in with the configured handle prefix)
Disallow: /discover
Disallow: /handle/${handle.prefix}/*/discover
#Disallow: /search-filter
#Disallow: /handle/${handle.prefix}/*/search-filter
Disallow: /simple-search
Disallow: /request-item
#
# Optionally uncomment the following line ONLY if sitemaps are working
# and you have verified that your site is being indexed correctly.
# Disallow: /browse
#
# If you have configured DSpace (Solr-based) Statistics to be publicly
# accessible, then you may not want this content to be indexed
# Disallow: /statistics
#
# You also may wish to disallow access to the following paths, in order
# to stop web spiders from accessing user-based content
# Disallow: /contact
# Disallow: /feedback
# Disallow: /forgot
# Disallow: /login
# Disallow: /register


##############################
# Section for misbehaving bots
# The following directives to block specific robots were borrowed from Wikipedia's robots.txt
##############################

# Advertising-related bots: blocked from the whole site.
User-agent: Mediapartners-Google*
Disallow: /

# Crawlers that are kind enough to obey robots.txt, but which we'd rather
# not have unless they're feeding search engines. Each one gets its own
# record that disallows the entire site.
User-agent: UbiCrawler
Disallow: /

User-agent: DOC
Disallow: /

User-agent: Zao
Disallow: /

# Some bots are known to be trouble, particularly those designed to copy
# entire sites. Please obey robots.txt.
# One record per agent, each disallowing the entire site ("Disallow: /").
User-agent: sitecheck.internetseer.com
Disallow: /

User-agent: Zealbot
Disallow: /

User-agent: MSIECrawler
Disallow: /

User-agent: SiteSnagger
Disallow: /

User-agent: WebStripper
Disallow: /

User-agent: WebCopier
Disallow: /

User-agent: Fetch
Disallow: /

User-agent: Offline Explorer
Disallow: /

User-agent: Teleport
Disallow: /

User-agent: TeleportPro
Disallow: /

User-agent: WebZIP
Disallow: /

User-agent: linko
Disallow: /

User-agent: HTTrack
Disallow: /

User-agent: Microsoft.URL.Control
Disallow: /

User-agent: Xenu
Disallow: /

User-agent: larbin
Disallow: /

User-agent: libwww
Disallow: /

User-agent: ZyBORG
Disallow: /

User-agent: Download Ninja
Disallow: /

# Misbehaving: sends requests much too fast.
User-agent: fast
Disallow: /

#
# If your DSpace is going down because of someone using recursive wget, 
# you can activate the following rule.
#
# If your own faculty is bringing down your dspace with recursive wget,
# you can advise them to use the --wait option to set the delay between hits.
#
#User-agent: wget
#Disallow: /

#
# The 'grub' distributed client has been *very* poorly behaved.
#
User-agent: grub-client
Disallow: /

#
# Doesn't follow robots.txt anyway, but listed in case that ever changes.
#
User-agent: k2spider
Disallow: /

#
# Hits many times per second, which is not acceptable.
# http://www.nameprotect.com/botinfo.html
User-agent: NPBot
Disallow: /

# A capture bot; downloads gazillions of pages with no public benefit.
# http://www.webreaper.net/
User-agent: WebReaper
Disallow: /


# RCAAP: block additional crawlers from the whole site.
# Each agent gets its own record, separated by a blank line, to match the
# record convention used by the rest of this file.
# User-agent matching is case-insensitive, so only one spelling of each
# token is listed. SiteSnagger, WebReaper and Xenu are not repeated here
# because the section for misbehaving bots earlier in this file already
# blocks them. The full user-agent string "ltx71 - (http://ltx71.com/)" is
# covered by the "ltx71" product token.
User-agent: adbeat_bot
Disallow: /

User-agent: AhrefsBot
Disallow: /

User-agent: AITCSRobot
Disallow: /

User-agent: Alexibot
Disallow: /

User-agent: Baiduspider
Disallow: /

User-agent: BLEXBot
Disallow: /

User-agent: Cliqzbot
Disallow: /

User-agent: DotBot
Disallow: /

User-agent: Exabot
Disallow: /

User-agent: expo9
Disallow: /

User-agent: Huaweisymantecspider
Disallow: /

User-agent: InfluenceBot
Disallow: /

User-agent: ltx71
Disallow: /

User-agent: MaxPointCrawler
Disallow: /

User-agent: MJ12bot
Disallow: /

User-agent: rogerbot
Disallow: /

User-agent: SemrushBot
Disallow: /

User-agent: SemrushBot-SA
Disallow: /

User-agent: seoscanners.net
Disallow: /

User-agent: SurveyBot
Disallow: /

User-agent: turnitinbot
Disallow: /

User-agent: Xaldon_WebSpider
Disallow: /

User-agent: YandexBot
Disallow: /

# RCAAP: delay crawlers
# A crawler that matches one of the records below uses ONLY that record and
# ignores the "User-agent: *" group, so the default Disallow rules are
# repeated here. Keep them in sync with the Default Access Group.
User-agent: bingbot
Crawl-delay: 10
Disallow: /discover
Disallow: /handle/${handle.prefix}/*/discover
Disallow: /simple-search
Disallow: /request-item

# NOTE(review): Yahoo's crawler is commonly addressed with the token "Slurp";
# confirm that "Yahoo" actually matches it before relying on this record.
User-agent: Yahoo
Crawl-delay: 10
Disallow: /discover
Disallow: /handle/${handle.prefix}/*/discover
Disallow: /simple-search
Disallow: /request-item