diff --git a/Rules b/Rules
index 2e37ed4..ee20e03 100644
--- a/Rules
+++ b/Rules
@@ -1,17 +1,28 @@
#!/usr/bin/env ruby
-BYPASS_FILES = %w(404.html crossdomain.xml humans.txt robots.txt) unless defined?(BYPASS_FILES)
+preprocess do
+ create_robots_txt
+ create_webmaster_tools_authentications
+ create_sitemap
+end
-BYPASS_FILES.each do |file|
- compile("/#{file.sub /\..+/, ''}/") do
- # don't filter bypass files
- end
+compile %r{^/(google|robots|assets)} do
end
compile %r{/_.+/$} do
# don't filter partials
end
+# Sitemap and htaccess do get filtered with erb, but get no layout.
+compile %r{^/(sitemap|htaccess)/$} do
+ filter :erb
+end
+
+compile '/sitemap/', :rep => 'gzip' do
+ filter :erb
+ filter :shellcmd, :cmd => 'gzip'
+end
+
compile '/css/*/' do
# filter :sass, syntax: :scss, load_paths: SASS_LOAD_PATHS
filter :sass, Compass.sass_engine_options
@@ -36,6 +47,13 @@ compile '/posts/*' do
filter :cache_buster
end
+compile %r{^/(404)/$} do
+ filter :haml, format: :html5, ugly: true
+
+ layout 'default'
+ filter :cache_buster
+end
+
compile '*' do
unless item.binary?
case item[:extension]
@@ -55,16 +73,19 @@ compile '*' do
end
end
-BYPASS_FILES.each do |file|
- route("/#{file.sub /\..+/, ''}/") do
- "/#{file}" # route bypass files as is
- end
-end
-
route %r{/_.+/$} do
nil # don't route partials
end
+route %r{^/(assets/.*|sitemap|robots|atom)/$} do
+ ext = item[:extension]
+ ext = 'js' if ext == 'coffee'
+ ext = 'css' if ext == 'scss'
+
+ fp = cachebust?(item) ? fingerprint(item[:filename]) : ''
+ item.identifier.chop + fp + '.' + ext
+end
+
route '/css/*/' do
fp = fingerprint(item[:filename])
item.identifier.chop + fp + '.css'
@@ -79,6 +100,22 @@ route '/rss/' do
'/rss.xml'
end
+route '/htaccess/' do
+ '/.htaccess'
+end
+
+route '/sitemap/', :rep => 'gzip' do
+ '/sitemap.xml.gz'
+end
+
+route '/sitemap/' do
+ '/sitemap.xml'
+end
+
+route %r{^/(404)/$} do
+ item.identifier.chop + '.html'
+end
+
route '/posts/*' do
y, m, d, slug = /([0-9]+)\-([0-9]+)\-([0-9]+)\-([^\/]+)/.match(item.identifier).captures
"/#{y}/#{m}/#{d}/#{slug}/index.html"
diff --git a/content/error.haml b/content/404.haml
similarity index 100%
rename from content/error.haml
rename to content/404.haml
diff --git a/content/htaccess.txt b/content/htaccess.txt
new file mode 100644
index 0000000..6fd908e
--- /dev/null
+++ b/content/htaccess.txt
@@ -0,0 +1,126 @@
+# ----------------------------------------------------------------------
+# Start rewrite engine
+# ----------------------------------------------------------------------
+
+# Turning on the rewrite engine is necessary for the following rules and features.
+
+
+ RewriteEngine On
+
+
+# ----------------------------------------------------------------------
+# Suppress or force the "www." at the beginning of URLs
+# ----------------------------------------------------------------------
+
+# The same content should never be available under two different URLs - especially not with and
+# without "www." at the beginning, since this can cause SEO problems (duplicate content).
+# That's why you should choose one of the alternatives and redirect the other one.
+
+# By default option 1 (no "www.") is activated. Remember: Shorter URLs are sexier.
+# no-www.org/faq.php?q=class_b
+
+# If you would rather use option 2, just comment out all option 1 lines
+# and uncomment option 2.
+# IMPORTANT: NEVER USE BOTH RULES AT THE SAME TIME!
+
+# ----------------------------------------------------------------------
+
+
+ RewriteCond %{HTTPS} !=on
+ RewriteCond %{HTTP_HOST} ^www\.(.+)$ [NC]
+ RewriteRule ^(.*)$ http://%1/$1 [R=301,L]
+
+
+
+
+# ----------------------------------------------------------------------
+# Add/remove trailing slash to (non-file) URLs
+# ----------------------------------------------------------------------
+
+# Google treats URLs with and without trailing slashes separately.
+# Forcing a trailing slash is usually preferred, but all that's really
+# important is that one correctly redirects to the other.
+
+# By default option 1 (force trailing slash) is activated.
+# http://googlewebmastercentral.blogspot.com/2010/04/to-slash-or-not-to-slash.html
+# http://www.alistapart.com/articles/slashforward/
+# http://httpd.apache.org/docs/2.0/misc/rewriteguide.html#url Trailing Slash Problem
+
+# ----------------------------------------------------------------------
+
+
+ RewriteCond %{REQUEST_FILENAME} !-f
+ RewriteCond %{REQUEST_URI} !(\.[a-zA-Z0-9]{1,5}|/|#(.*))$
+ RewriteRule ^(.*)$ /$1/ [R=301,L]
+
+
+# ----------------------------------------------------------------------
+
+# Option 2:
+# Rewrite "domain.com/foo/ -> domain.com/foo"
+
+#
+# RewriteRule ^(.*)/$ /$1 [R=301,L]
+#
+
+
+
+# ----------------------------------------------------------------------
+# Prevent 404 errors for non-existing redirected folders
+# ----------------------------------------------------------------------
+
+# without -MultiViews, Apache will give a 404 for a rewrite if a folder of the same name does not exist
+# e.g. /blog/hello : webmasterworld.com/apache/3808792.htm
+
+Options -MultiViews
+
+
+
+# ----------------------------------------------------------------------
+# custom 404 page
+# ----------------------------------------------------------------------
+
+# You can add custom pages to handle 500 or 403 pretty easily, if you like.
+ErrorDocument 404 /404.html
+
+
+
+# ----------------------------------------------------------------------
+# UTF-8 encoding
+# ----------------------------------------------------------------------
+
+# use utf-8 encoding for anything served text/plain or text/html
+AddDefaultCharset utf-8
+
+# force utf-8 for a number of file formats
+AddCharset utf-8 .html .css .js .xml .json .rss
+
+
+
+# ----------------------------------------------------------------------
+# A little more security
+# ----------------------------------------------------------------------
+
+
+# Do we want to advertise the exact version number of Apache we're running?
+# Probably not.
+## ServerTokens is only valid in the server config (httpd.conf) - enabling it
+## here in .htaccess would cause a 500 error, so it stays commented out:
+
+
+# "-Indexes" will have Apache block users from browsing folders without a default document
+# Usually you should leave this activated, because you shouldn't allow everybody to surf through
+# every folder on your server (which includes rather private places like CMS system folders).
+Options -Indexes
+
+
+# Block access to "hidden" directories whose names begin with a period. This
+# includes directories used by version control systems such as Subversion or Git.
+
+ RewriteRule "(^|/)\." - [F]
+
+
+<% if @site.config[:redirects] %>
+# Set up URL redirects<% @site.config[:redirects].each do |h| %>
+Redirect 301 <%= h[:from] %> <%= h[:to] %>
+<% end %><% end %>
\ No newline at end of file
diff --git a/lib/default.rb b/lib/default.rb
index 74f1b66..de8f567 100644
--- a/lib/default.rb
+++ b/lib/default.rb
@@ -15,6 +15,7 @@ unless defined? LOADED_DEFAULT_CONFIG
include Nanoc3::Helpers::HTMLEscape
include Nanoc3::Helpers::Rendering
include Nanoc3::Helpers::LinkTo
+ include Nanoc3::Helpers::XMLSitemap
# cachebuster
require 'nanoc/cachebuster'
diff --git a/lib/preprocessors.rb b/lib/preprocessors.rb
new file mode 100644
index 0000000..3d40b51
--- /dev/null
+++ b/lib/preprocessors.rb
@@ -0,0 +1,92 @@
+# Preprocessor helpers
+#
+# This file has a collection of methods that are meant to be used in the
+# preprocess-block in the Nanoc Rules file.
+#
+# @author Arjan van der Gaag
+
+
+# Generate a sitemap.xml file using Nanoc's own xml_sitemap helper method by
+# dynamically adding a new item.
+#
+# Make items that should not appear in the sitemap hidden. This by default
+# works on all image files and typical assets, as well as error pages and
+# htaccess. The is_hidden attribute is only explicitly set if it is absent,
+# allowing per-file overriding.
+#
+# @todo extract hidden file types into configuration file?
+def create_sitemap
+  return unless @site.config[:output_generated_assets] # global toggle, see nanoc.yaml
+
+  @items.each do |item|
+    if %w{png gif jpg jpeg coffee scss sass less css xml js txt ico}.include?(item[:extension]) ||
+       item.identifier =~ /404|500|htaccess/
+      item[:is_hidden] = true unless item.attributes.has_key?(:is_hidden) # only set when absent, so per-file overrides win
+    end
+  end
+  @items << Nanoc3::Item.new(
+    "<%= xml_sitemap %>", # expanded later by the :erb filter in the compile rule for /sitemap/
+    { :extension => 'xml', :is_hidden => true },
+    '/sitemap/'
+  )
+end
+
+# Use special settings from the site configuration to generate the files
+# necessary for various webmaster tools authentications, such as the services
+# from Google, Yahoo and Bing.
+#
+# This loops through all the items in the `webmaster_tools` setting, using
+# its properties to generate a new item.
+#
+# See config.yaml for more documentation on the input format.
+def create_webmaster_tools_authentications
+ return unless @site.config[:output_generated_assets]
+
+ @site.config[:webmaster_tools].each do |file|
+ next if file[:identifier].nil?
+ content = file.delete(:content)
+ identifier = file.delete(:identifier)
+ file.merge({ :is_hidden => true })
+ @items << Nanoc3::Item.new(
+ content,
+ file,
+ identifier
+ )
+ end
+end
+
+# Generate a robots.txt file in the root of the site by dynamically creating
+# a new item.
+#
+# By default, this outputs a robots.txt file that disallows all
+# assets except images and points to the sitemap file.
+#
+# You can override the contents of the output of this method using the site
+# configuration, specifying Allow and Disallow directives. See the config.yaml
+# file for more information on the expected input format.
+def create_robots_txt
+ return unless @site.config[:output_generated_assets]
+
+ if @site.config[:robots]
+ content = if @site.config[:robots][:default]
+ <<-EOS
+User-agent: *
+Disallow: /assets
+Allow: /assets/images
+Sitemap: #{@site.config[:base_url]}/sitemap.xml
+ EOS
+ else
+ [
+ 'User-Agent: *',
+ @site.config[:robots][:disallow].map { |l| "Disallow: #{l}" },
+ (@site.config[:robots][:allow] || []).map { |l| "Allow: #{l}" },
+ "Sitemap: #{@site.config[:robots][:sitemap]}"
+ ].flatten.compact.join("\n")
+ end
+ @items << Nanoc3::Item.new(
+ content,
+ { :extension => 'txt', :is_hidden => true },
+ '/robots/'
+ )
+ end
+end
\ No newline at end of file
diff --git a/lib/shellcmd_filter.rb b/lib/shellcmd_filter.rb
new file mode 100644
index 0000000..4862fbd
--- /dev/null
+++ b/lib/shellcmd_filter.rb
@@ -0,0 +1,31 @@
+require 'open3'
+
+# This nanoc filter is a general purpose filter that simply pipes
+# the contents of an item into a given shell command, and sets
+# the items output to the output of it.
+#
+# It is NOT safe to use on large inputs, which will cause I/O
+# deadlocks. Any safer implementation is encouraged.
+#
+# Usage:
+#
+# compile '/static/js/*/' do
+# # minify JS :)
+# filter :shellcmd, "java -jar js-compiler.jar"
+# end
+#
+# Written by Vincent Driessen (http://twitter.com/nvie) and
+# released to the public domain.
+#
+# http://nvie.com
+class ShellCmdFilter < Nanoc3::Filter
+ identifier :shellcmd
+
+ def run(content, params={ :cmd => "sed s/foo/bar/" })
+ Open3.popen3(params[:cmd]) do |stdin, stdout, stderr|
+ stdin.write(content)
+ stdin.close()
+ stdout.read()
+ end
+ end
+end
\ No newline at end of file
diff --git a/nanoc.yaml b/nanoc.yaml
index 1f22d75..b95a173 100644
--- a/nanoc.yaml
+++ b/nanoc.yaml
@@ -24,6 +24,43 @@ title: 'ariejan.net'
author_name: 'Ariejan de Vroom'
author_uri: 'http://ariejan.net'
+
+# Configure the robots.txt file for this site.
+# Setting 'default' to true-ish will use sensible defaults. If you
+# wish to customize it, you can list paths to allow and to disallow.
+# Finally, you could manually set the path to the sitemap file.
+#
+# You can customize the robots file fairly well like this, but you
+# can always manually create a content file with the exact contents
+# you need.
+robots:
+ default: true # disallow assets, allow assets/images and point at sitemap
+ # disallow:
+ # - '/tag'
+ # - '/newsletter'
+ # allow:
+ # - '/tag/foo'
+ # sitemap: '/site-map.txt'
+
+# Set up authentication files for various webmaster tools (or something
+# similar). This simply creates a plain text file when generating the site.
+#
+# identifier: identifier of the output file, e.g. '/google12345/'
+# content: content of the file, e.g. 'aoa8202ns001'
+# extension: extension of the output file, e.g. 'html' or 'xml'
+webmaster_tools:
+ -
+ identifier:
+ content:
+ extension:
+
+# Should Nanoc generate some standard asset files for you, or skip them
+# completely?
+#
+# This currently controls the generation of robots.txt, sitemap.xml,
+# sitemap.xml.gz and webmaster tools authentication files.
+output_generated_assets: true
+
prune:
# Whether to automatically remove files not managed by nanoc from the output
# directory. For safety reasons, this is turned off by default.