Better server files like robots, 404, etc.

This commit is contained in:
Ariejan de Vroom 2013-03-24 22:27:56 +01:00
parent 4ce61f8b40
commit 528b0ae80e
7 changed files with 335 additions and 11 deletions

59
Rules
View File

@ -1,17 +1,28 @@
#!/usr/bin/env ruby
BYPASS_FILES = %w(404.html crossdomain.xml humans.txt robots.txt) unless defined?(BYPASS_FILES)
preprocess do
create_robots_txt
create_webmaster_tools_authentications
create_sitemap
end
BYPASS_FILES.each do |file|
compile("/#{file.sub /\..+/, ''}/") do
# don't filter bypass files
end
compile %r{^/(google|robots|assets)} do
end
compile %r{/_.+/$} do
# don't filter partials
end
# Sitemap and htaccess do get filtered with erb, but get no layout.
compile %r{^/(sitemap|htaccess)/$} do
filter :erb
end
compile '/sitemap/', :rep => 'gzip' do
filter :erb
filter :shellcmd, :cmd => 'gzip'
end
compile '/css/*/' do
# filter :sass, syntax: :scss, load_paths: SASS_LOAD_PATHS
filter :sass, Compass.sass_engine_options
@ -36,6 +47,13 @@ compile '/posts/*' do
filter :cache_buster
end
compile %r{^/(404)/$} do
filter :haml, format: :html5, ugly: true
layout 'default'
filter :cache_buster
end
compile '*' do
unless item.binary?
case item[:extension]
@ -55,16 +73,19 @@ compile '*' do
end
end
BYPASS_FILES.each do |file|
route("/#{file.sub /\..+/, ''}/") do
"/#{file}" # route bypass files as is
end
end
route %r{/_.+/$} do
nil # don't route partials
end
route %r{^/(assets/.*|sitemap|robots|atom)/$} do
ext = item[:extension]
ext = 'js' if ext == 'coffee'
ext = 'css' if ext == 'scss'
fp = cachebust?(item) ? fingerprint(item[:filename]) : ''
item.identifier.chop + fp + '.' + ext
end
route '/css/*/' do
fp = fingerprint(item[:filename])
item.identifier.chop + fp + '.css'
@ -79,6 +100,22 @@ route '/rss/' do
'/rss.xml'
end
route '/htaccess/' do
'/.htaccess'
end
route '/sitemap/', :rep => 'gzip' do
'/sitemap.xml.gz'
end
route '/sitemap/' do
'/sitemap.xml'
end
route %r{^/(404)/$} do
item.identifier.chop + '.html'
end
route '/posts/*' do
y, m, d, slug = /([0-9]+)\-([0-9]+)\-([0-9]+)\-([^\/]+)/.match(item.identifier).captures
"/#{y}/#{m}/#{d}/#{slug}/index.html"

126
content/htaccess.txt Normal file
View File

@ -0,0 +1,126 @@
# ----------------------------------------------------------------------
# Start rewrite engine
# ----------------------------------------------------------------------
# Turning on the rewrite engine is necessary for the following rules and features.
<IfModule mod_rewrite.c>
RewriteEngine On
</IfModule>
# ----------------------------------------------------------------------
# Suppress or force the "www." at the beginning of URLs
# ----------------------------------------------------------------------
# The same content should never be available under two different URLs - especially not with and
# without "www." at the beginning, since this can cause SEO problems (duplicate content).
# That's why you should choose one of the alternatives and redirect the other one.
# By default option 1 (no "www.") is activated. Remember: Shorter URLs are sexier.
# no-www.org/faq.php?q=class_b
# If you rather want to use option 2, just comment out all option 1 lines
# and uncomment option 2.
# IMPORTANT: NEVER USE BOTH RULES AT THE SAME TIME!
# ----------------------------------------------------------------------
<IfModule mod_rewrite.c>
RewriteCond %{HTTPS} !=on
RewriteCond %{HTTP_HOST} ^www\.(.+)$ [NC]
RewriteRule ^(.*)$ http://%1/$1 [R=301,L]
</IfModule>
# ----------------------------------------------------------------------
# Add/remove trailing slash to (non-file) URLs
# ----------------------------------------------------------------------
# Google treats URLs with and without trailing slashes separately.
# Forcing a trailing slash is usually preferred, but all that's really
# important is that one correctly redirects to the other.
# By default option 1 (force trailing slash) is activated.
# http://googlewebmastercentral.blogspot.com/2010/04/to-slash-or-not-to-slash.html
# http://www.alistapart.com/articles/slashforward/
# http://httpd.apache.org/docs/2.0/misc/rewriteguide.html (see the "Trailing Slash Problem" section)
# ----------------------------------------------------------------------
<IfModule mod_rewrite.c>
RewriteCond %{REQUEST_FILENAME} !-f
RewriteCond %{REQUEST_URI} !(\.[a-zA-Z0-9]{1,5}|/|#(.*))$
RewriteRule ^(.*)$ /$1/ [R=301,L]
</IfModule>
# ----------------------------------------------------------------------
# Option 2:
# Rewrite "domain.com/foo/ -> domain.com/foo"
#<IfModule mod_rewrite.c>
# RewriteRule ^(.*)/$ /$1 [R=301,L]
#</IfModule>
# ----------------------------------------------------------------------
# Prevent 404 errors for non-existing redirected folders
# ----------------------------------------------------------------------
# without -MultiViews, Apache will give a 404 for a rewrite if a folder of the same name does not exist
# e.g. /blog/hello : webmasterworld.com/apache/3808792.htm
Options -MultiViews
# ----------------------------------------------------------------------
# custom 404 page
# ----------------------------------------------------------------------
# You can add custom pages to handle 500 or 403 pretty easily, if you like.
ErrorDocument 404 /404.html
# ----------------------------------------------------------------------
# UTF-8 encoding
# ----------------------------------------------------------------------
# use utf-8 encoding for anything served text/plain or text/html
AddDefaultCharset utf-8
# force utf-8 for a number of file formats
AddCharset utf-8 .html .css .js .xml .json .rss
# ----------------------------------------------------------------------
# A little more security
# ----------------------------------------------------------------------
# Do we want to advertise the exact version number of Apache we're running?
# Probably not.
# NOTE: ServerTokens is only allowed in the main server configuration
# (httpd.conf). Leaving it active inside an .htaccess file makes Apache
# answer every request with "500 Internal Server Error", so it is kept
# commented out here — set it in httpd.conf instead.
# ServerTokens Prod
# "-Indexes" will have Apache block users from browsing folders without a default document
# Usually you should leave this activated, because you shouldn't allow everybody to surf through
# every folder on your server (which includes rather private places like CMS system folders).
Options -Indexes
# Block access to "hidden" directories whose names begin with a period. This
# includes directories used by version control systems such as Subversion or Git.
<IfModule mod_rewrite.c>
RewriteRule "(^|/)\." - [F]
</IfModule>
<% if @site.config[:redirects] %>
# Set up URL redirects<% @site.config[:redirects].each do |h| %>
Redirect 301 <%= h[:from] %> <%= h[:to] %>
<% end %><% end %>

View File

@ -15,6 +15,7 @@ unless defined? LOADED_DEFAULT_CONFIG
include Nanoc3::Helpers::HTMLEscape
include Nanoc3::Helpers::Rendering
include Nanoc3::Helpers::LinkTo
include Nanoc3::Helpers::XMLSitemap
# cachebuster
require 'nanoc/cachebuster'

92
lib/preprocessors.rb Normal file
View File

@ -0,0 +1,92 @@
# Preprocessor helpers
#
# This file has a collection of methods that are meant to be used in the
# preprocess-block in the Nanoc Rules file.
#
# @author Arjan van der Gaag
# Build and register a sitemap.xml item via Nanoc's own xml_sitemap helper,
# by dynamically adding a new '/sitemap/' item.
#
# Before adding the sitemap item, items that should stay out of the sitemap
# are marked hidden: typical asset extensions plus error pages and the
# htaccess file. An already-present :is_hidden attribute is never
# overwritten, so individual files can still override this per-file.
#
# @todo extract hidden file types into configuration file?
def create_sitemap
  return unless @site.config[:output_generated_assets]

  asset_extensions = %w{png gif jpg jpeg coffee scss sass less css xml js txt ico}
  @items.each do |entry|
    next unless asset_extensions.include?(entry[:extension]) ||
                entry.identifier =~ /404|500|htaccess/
    entry[:is_hidden] = true unless entry.attributes.has_key?(:is_hidden)
  end

  @items << Nanoc3::Item.new(
    "<%= xml_sitemap %>",
    { :extension => 'xml', :is_hidden => true },
    '/sitemap/'
  )
end
# Use special settings from the site configuration to generate the files
# necessary for various webmaster tools authentications, such as the services
# from Google, Yahoo and Bing.
#
# This loops through all the entries in the `webmaster_tools` setting, using
# each entry's remaining properties (after removing :content and :identifier)
# as the attributes of a newly created item.
#
# See config.yaml for more documentation on the input format.
def create_webmaster_tools_authentications
  return unless @site.config[:output_generated_assets]
  @site.config[:webmaster_tools].each do |file|
    next if file[:identifier].nil?
    content = file.delete(:content)
    identifier = file.delete(:identifier)
    # BUGFIX: the original called `file.merge({ :is_hidden => true })` and
    # discarded the result — Hash#merge is non-mutating, so the generated
    # items were never actually hidden. Mutate the attributes hash in place.
    file[:is_hidden] = true
    @items << Nanoc3::Item.new(
      content,
      file,
      identifier
    )
  end
end
# Generate a robots.txt file in the root of the site by dynamically adding
# a new '/robots/' item.
#
# When the :robots -> :default site setting is true-ish, a sensible default
# is emitted: disallow /assets except /assets/images, and point at the
# sitemap file. Otherwise the Allow/Disallow/Sitemap directives are
# assembled from the :robots configuration; see the config.yaml file for
# more information on the expected input format.
def create_robots_txt
  return unless @site.config[:output_generated_assets]

  robots = @site.config[:robots]
  return unless robots

  content =
    if robots[:default]
      <<-EOS
User-agent: *
Disallow: /assets
Allow: /assets/images
Sitemap: #{@site.config[:base_url]}/sitemap.xml
EOS
    else
      directives = ['User-Agent: *']
      directives.concat(robots[:disallow].map { |path| "Disallow: #{path}" })
      directives.concat((robots[:allow] || []).map { |path| "Allow: #{path}" })
      directives << "Sitemap: #{robots[:sitemap]}"
      directives.join("\n")
    end

  @items << Nanoc3::Item.new(
    content,
    { :extension => 'txt', :is_hidden => true },
    '/robots/'
  )
end

31
lib/shellcmd_filter.rb Normal file
View File

@ -0,0 +1,31 @@
require 'open3'
# This nanoc filter is a general purpose filter that pipes the contents of
# an item into a given shell command and replaces the item's content with
# the command's standard output.
#
# Open3.capture3 feeds stdin and drains stdout/stderr concurrently, so —
# unlike the original popen3 write-then-read implementation — large inputs
# no longer risk an I/O deadlock. Stderr is captured and discarded, matching
# the original behavior.
#
# Usage:
#
#   compile '/static/js/*/' do
#     # minify JS :)
#     filter :shellcmd, :cmd => "java -jar js-compiler.jar"
#   end
#
# Originally written by Vincent Driessen (http://twitter.com/nvie) and
# released to the public domain.
#
# http://nvie.com
class ShellCmdFilter < Nanoc3::Filter
  identifier :shellcmd

  # @param content [String] the item's content, piped to the command's stdin
  # @param params [Hash] expects :cmd, the shell command to run
  # @return [String] the command's standard output
  def run(content, params={ :cmd => "sed s/foo/bar/" })
    out, _err, _status = Open3.capture3(params[:cmd], :stdin_data => content)
    out
  end
end

View File

@ -24,6 +24,43 @@ title: 'ariejan.net'
author_name: 'Ariejan de Vroom'
author_uri: 'http://ariejan.net'
# Configure the robots.txt file for this site.
# Setting 'default' to true-ish will use sensible defaults. If you
# wish to customize it, you can list paths to allow and to disallow.
# Finally, you could manually set the path to the sitemap file.
#
# You can customize the robots file fairly well like this, but you
# can always manually create a content file with the exact contents
# you need.
robots:
default: true # disallow assets, allow assets/images and point at sitemap
# disallow:
# - '/tag'
# - '/newsletter'
# allow:
# - '/tag/foo'
# sitemap: '/site-map.txt'
# Set up authentication files for various webmaster tools (or something
# similar). This simply creates a plain text file when generating the site.
#
# identifier: identifier of the output file, e.g. '/google12345/'
# content: content of the file, e.g. 'aoa8202ns001'
# extension: extension of the output file, e.g. 'html' or 'xml'
webmaster_tools:
-
identifier:
content:
extension:
# Should Nanoc generate some standard asset files for you, or skip them
# completely?
#
# This currently controls the generation of robots.txt, sitemap.xml,
# sitemap.xml.gz and webmaster tools authentication files.
output_generated_assets: true
prune:
# Whether to automatically remove files not managed by nanoc from the output
# directory. For safety reasons, this is turned off by default.