diff --git a/Rules b/Rules
index 2e37ed4..ee20e03 100644
--- a/Rules
+++ b/Rules
@@ -1,17 +1,28 @@
 #!/usr/bin/env ruby
 
-BYPASS_FILES = %w(404.html crossdomain.xml humans.txt robots.txt) unless defined?(BYPASS_FILES)
+preprocess do
+  create_robots_txt
+  create_webmaster_tools_authentications
+  create_sitemap
+end
 
-BYPASS_FILES.each do |file|
-  compile("/#{file.sub /\..+/, ''}/") do
-    # don't filter bypass files
-  end
+compile %r{^/(google|robots|assets)} do
 end
 
 compile %r{/_.+/$} do
   # don't filter partials
 end
 
+# Sitemap and htaccess do get filtered with erb, but get no layout.
+compile %r{^/(sitemap|htaccess)/$} do
+  filter :erb
+end
+
+compile '/sitemap/', :rep => 'gzip' do
+  filter :erb
+  filter :shellcmd, :cmd => 'gzip'
+end
+
 compile '/css/*/' do
   # filter :sass, syntax: :scss, load_paths: SASS_LOAD_PATHS
   filter :sass, Compass.sass_engine_options
@@ -36,6 +47,13 @@ compile '/posts/*' do
   filter :cache_buster
 end
 
+compile %r{^/(404)/$} do
+  filter :haml, format: :html5, ugly: true
+
+  layout 'default'
+  filter :cache_buster
+end
+
 compile '*' do
   unless item.binary?
     case item[:extension]
@@ -55,16 +73,19 @@ compile '*' do
   end
 end
 
-BYPASS_FILES.each do |file|
-  route("/#{file.sub /\..+/, ''}/") do
-    "/#{file}" # route bypass files as is
-  end
-end
-
 route %r{/_.+/$} do
   nil # don't route partials
 end
 
+route %r{^/(assets/.*|sitemap|robots|atom)/$} do
+  ext = item[:extension]
+  ext = 'js' if ext == 'coffee'
+  ext = 'css' if ext == 'scss'
+
+  fp = cachebust?(item) ? fingerprint(item[:filename]) : ''
+  item.identifier.chop + fp + '.' + ext
+end
+
 route '/css/*/' do
   fp = fingerprint(item[:filename])
   item.identifier.chop + fp + '.css'
@@ -79,6 +100,22 @@ route '/rss/' do
   '/rss.xml'
 end
 
+route '/htaccess/' do
+  '/.htaccess'
+end
+
+route '/sitemap/', :rep => 'gzip' do
+  '/sitemap.xml.gz'
+end
+
+route '/sitemap/' do
+  '/sitemap.xml'
+end
+
+route %r{^/(404)/$} do
+  item.identifier.chop + '.html'
+end
+
 route '/posts/*' do
   y, m, d, slug = /([0-9]+)\-([0-9]+)\-([0-9]+)\-([^\/]+)/.match(item.identifier).captures
   "/#{y}/#{m}/#{d}/#{slug}/index.html"
diff --git a/content/error.haml b/content/404.haml
similarity index 100%
rename from content/error.haml
rename to content/404.haml
diff --git a/content/htaccess.txt b/content/htaccess.txt
new file mode 100644
index 0000000..6fd908e
--- /dev/null
+++ b/content/htaccess.txt
@@ -0,0 +1,126 @@
+# ----------------------------------------------------------------------
+# Start rewrite engine
+# ----------------------------------------------------------------------
+
+# Turning on the rewrite engine is necessary for the following rules and features.
+
+  RewriteEngine On
+
+# ----------------------------------------------------------------------
+# Suppress or force the "www." at the beginning of URLs
+# ----------------------------------------------------------------------
+
+# The same content should never be available under two different URLs - especially not with and
+# without "www." at the beginning, since this can cause SEO problems (duplicate content).
+# That's why you should choose one of the alternatives and redirect the other one.
+
+# By default option 1 (no "www.") is activated. Remember: Shorter URLs are sexier.
+# no-www.org/faq.php?q=class_b
+
+# If you would rather use option 2, just comment out all option 1 lines
+# and uncomment option 2.
+# IMPORTANT: NEVER USE BOTH RULES AT THE SAME TIME!
+
+# ----------------------------------------------------------------------
+
+  RewriteCond %{HTTPS} !=on
+  RewriteCond %{HTTP_HOST} ^www\.(.+)$ [NC]
+  RewriteRule ^(.*)$ http://%1/$1 [R=301,L]
+
+
+
+# ----------------------------------------------------------------------
+# Add/remove trailing slash to (non-file) URLs
+# ----------------------------------------------------------------------
+
+# Google treats URLs with and without trailing slashes separately.
+# Forcing a trailing slash is usually preferred, but all that's really
+# important is that one correctly redirects to the other.
+
+# By default option 1 (force trailing slash) is activated.
+# http://googlewebmastercentral.blogspot.com/2010/04/to-slash-or-not-to-slash.html
+# http://www.alistapart.com/articles/slashforward/
+# http://httpd.apache.org/docs/2.0/misc/rewriteguide.html#url Trailing Slash Problem
+
+# ----------------------------------------------------------------------
+
+  RewriteCond %{REQUEST_FILENAME} !-f
+  RewriteCond %{REQUEST_URI} !(\.[a-zA-Z0-9]{1,5}|/|#(.*))$
+  RewriteRule ^(.*)$ /$1/ [R=301,L]
+
+# ----------------------------------------------------------------------
+
+# Option 2:
+# Rewrite "domain.com/foo/ -> domain.com/foo"
+
+#
+#  RewriteRule ^(.*)/$ /$1 [R=301,L]
+#
+
+
+# ----------------------------------------------------------------------
+# Prevent 404 errors for non-existing redirected folders
+# ----------------------------------------------------------------------
+
+# Without -MultiViews, Apache will give a 404 for a rewrite if a folder of the same name does not exist.
+# e.g. /blog/hello : webmasterworld.com/apache/3808792.htm
+
+Options -MultiViews
+
+
+# ----------------------------------------------------------------------
+# Custom 404 page
+# ----------------------------------------------------------------------
+
+# You can add custom pages to handle 500 or 403 pretty easily, if you like.
+ErrorDocument 404 /404.html
+
+
+# ----------------------------------------------------------------------
+# UTF-8 encoding
+# ----------------------------------------------------------------------
+
+# Use utf-8 encoding for anything served as text/plain or text/html.
+AddDefaultCharset utf-8
+
+# Force utf-8 for a number of file formats.
+AddCharset utf-8 .html .css .js .xml .json .rss
+
+
+# ----------------------------------------------------------------------
+# A little more security
+# ----------------------------------------------------------------------
+
+
+# Do we want to advertise the exact version number of Apache we're running?
+# Probably not.
+## This can only be enabled if used in httpd.conf - It will not work in .htaccess
+# ServerTokens Prod
+
+
+# "-Indexes" will have Apache block users from browsing folders without a default document.
+# Usually you should leave this activated, because you shouldn't allow everybody to surf through
+# every folder on your server (which includes rather private places like CMS system folders).
+Options -Indexes
+
+
+# Block access to "hidden" directories whose names begin with a period. This
+# includes directories used by version control systems such as Subversion or Git.
+  RewriteRule "(^|/)\." - [F]
+
+
+<% if @site.config[:redirects] %>
+# Set up URL redirects<% @site.config[:redirects].each do |h| %>
+Redirect 301 <%= h[:from] %> <%= h[:to] %>
+<% end %><% end %>
\ No newline at end of file
diff --git a/lib/default.rb b/lib/default.rb
index 74f1b66..de8f567 100644
--- a/lib/default.rb
+++ b/lib/default.rb
@@ -15,6 +15,7 @@ unless defined? LOADED_DEFAULT_CONFIG
   include Nanoc3::Helpers::HTMLEscape
   include Nanoc3::Helpers::Rendering
   include Nanoc3::Helpers::LinkTo
+  include Nanoc3::Helpers::XMLSitemap
 
   # cachebuster
   require 'nanoc/cachebuster'
diff --git a/lib/preprocessors.rb b/lib/preprocessors.rb
new file mode 100644
index 0000000..3d40b51
--- /dev/null
+++ b/lib/preprocessors.rb
@@ -0,0 +1,92 @@
+# Preprocessor helpers
+#
+# This file has a collection of methods that are meant to be used in the
+# preprocess block in the Nanoc Rules file.
+#
+# @author Arjan van der Gaag
+
+
+# Generate a sitemap.xml file using Nanoc's own xml_sitemap helper method by
+# dynamically adding a new item.
+#
+# Items that should not appear in the sitemap are marked hidden. By default
+# this applies to all image files and typical assets, as well as the error
+# pages and htaccess. The is_hidden attribute is only set explicitly when it
+# is absent, allowing per-file overrides.
+#
+# @todo extract hidden file types into configuration file?
+def create_sitemap
+  return unless @site.config[:output_generated_assets]
+
+  @items.each do |item|
+    if %w{png gif jpg jpeg coffee scss sass less css xml js txt ico}.include?(item[:extension]) ||
+        item.identifier =~ /404|500|htaccess/
+      item[:is_hidden] = true unless item.attributes.has_key?(:is_hidden)
+    end
+  end
+  @items << Nanoc3::Item.new(
+    "<%= xml_sitemap %>",
+    { :extension => 'xml', :is_hidden => true },
+    '/sitemap/'
+  )
+end
+
+# Use special settings from the site configuration to generate the files
+# necessary for various webmaster tools authentications, such as the services
+# from Google, Yahoo and Bing.
+#
+# This loops through all the entries in the `webmaster_tools` setting, using
+# each entry's properties to generate a new item.
+#
+# See nanoc.yaml for more documentation on the input format.
+def create_webmaster_tools_authentications
+  return unless @site.config[:output_generated_assets]
+
+  @site.config[:webmaster_tools].each do |file|
+    next if file[:identifier].nil?
+    content = file.delete(:content)
+    identifier = file.delete(:identifier)
+    file.merge!({ :is_hidden => true })
+    @items << Nanoc3::Item.new(
+      content,
+      file,
+      identifier
+    )
+  end
+end
+
+# Generate a robots.txt file in the root of the site by dynamically creating
+# a new item.
+#
+# This outputs either a default robots.txt file, which disallows all
+# assets except images and points to the sitemap file, or one built from
+# custom directives in the site configuration.
+#
+# You can override the output of this method using the site configuration,
+# specifying Allow and Disallow directives. See the nanoc.yaml
+# file for more information on the expected input format.
+def create_robots_txt
+  return unless @site.config[:output_generated_assets]
+
+  if @site.config[:robots]
+    content = if @site.config[:robots][:default]
+      <<-EOS
+User-agent: *
+Disallow: /assets
+Allow: /assets/images
+Sitemap: #{@site.config[:base_url]}/sitemap.xml
+      EOS
+    else
+      [
+        'User-Agent: *',
+        @site.config[:robots][:disallow].map { |l| "Disallow: #{l}" },
+        (@site.config[:robots][:allow] || []).map { |l| "Allow: #{l}" },
+        "Sitemap: #{@site.config[:robots][:sitemap]}"
+      ].flatten.compact.join("\n")
+    end
+    @items << Nanoc3::Item.new(
+      content,
+      { :extension => 'txt', :is_hidden => true },
+      '/robots/'
+    )
+  end
+end
\ No newline at end of file
diff --git a/lib/shellcmd_filter.rb b/lib/shellcmd_filter.rb
new file mode 100644
index 0000000..4862fbd
--- /dev/null
+++ b/lib/shellcmd_filter.rb
@@ -0,0 +1,31 @@
+require 'open3'
+
+# This nanoc filter is a general purpose filter that simply pipes the
+# contents of an item into a given shell command and uses the command's
+# output as the item's compiled content.
+#
+# It is NOT safe to use on large inputs, which will cause I/O
+# deadlocks. Any safer implementation is encouraged.
+#
+# Usage:
+#
+#   compile '/static/js/*/' do
+#     # minify JS :)
+#     filter :shellcmd, :cmd => "java -jar js-compiler.jar"
+#   end
+#
+# Written by Vincent Driessen (http://twitter.com/nvie) and
+# released to the public domain.
+#
+# http://nvie.com
+class ShellCmdFilter < Nanoc3::Filter
+  identifier :shellcmd
+
+  def run(content, params = { :cmd => "sed s/foo/bar/" })
+    Open3.popen3(params[:cmd]) do |stdin, stdout, stderr|
+      stdin.write(content)
+      stdin.close
+      stdout.read
+    end
+  end
+end
\ No newline at end of file
diff --git a/nanoc.yaml b/nanoc.yaml
index 1f22d75..b95a173 100644
--- a/nanoc.yaml
+++ b/nanoc.yaml
@@ -24,6 +24,43 @@ title: 'ariejan.net'
 author_name: 'Ariejan de Vroom'
 author_uri: 'http://ariejan.net'
 
+
+# Configure the robots.txt file for this site.
+# Setting 'default' to true-ish will use sensible defaults. If you
+# wish to customize it, you can list paths to allow and to disallow.
+# Finally, you could manually set the path to the sitemap file.
+#
+# You can customize the robots file fairly well like this, but you
+# can always manually create a content file with the exact contents
+# you need.
+robots:
+  default: true # disallow assets, allow assets/images and point at sitemap
+  # disallow:
+  #   - '/tag'
+  #   - '/newsletter'
+  # allow:
+  #   - '/tag/foo'
+  # sitemap: '/site-map.txt'
+
+# Set up authentication files for various webmaster tools (or something
+# similar). This simply creates a plain text file when generating the site.
+#
+# identifier: identifier of the output file, e.g. '/google12345/'
+# content: content of the file, e.g. 'aoa8202ns001'
+# extension: extension of the output file, e.g. 'html' or 'xml'
+webmaster_tools:
+  -
+    identifier:
+    content:
+    extension:
+
+# Should Nanoc generate some standard asset files for you, or skip them
+# completely?
+#
+# This currently controls the generation of robots.txt, sitemap.xml,
+# sitemap.xml.gz and webmaster tools authentication files.
+output_generated_assets: true
+
 prune:
   # Whether to automatically remove files not managed by nanoc from the output
   # directory. For safety reasons, this is turned off by default.
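A note on the shellcmd filter added in lib/shellcmd_filter.rb above: as its own comment warns, writing the whole item to the command's stdin before reading stdout can deadlock once the output fills the pipe buffer. The sketch below is one possible safer run method, using Open3.capture2 from Ruby's standard open3 library, which feeds stdin and drains stdout concurrently. It keeps the :shellcmd identifier and the :cmd parameter from the patch; the failure check and the :binmode flag (so binary output, such as gzip's for the sitemap.xml.gz rep, survives intact) are assumptions, not something the original filter does.

    require 'open3'

    # Sketch of a deadlock-free variant of ShellCmdFilter. Open3.capture2
    # writes the input and reads the output in parallel, and also reports
    # the command's exit status.
    class ShellCmdFilter < Nanoc3::Filter
      identifier :shellcmd

      def run(content, params = {})
        cmd = params.fetch(:cmd)
        output, status = Open3.capture2(cmd, :stdin_data => content, :binmode => true)
        raise "shellcmd `#{cmd}` exited with #{status.exitstatus}" unless status.success?
        output
      end
    end

With a variant like this, the compile rules above can keep calling filter :shellcmd, :cmd => 'gzip' unchanged.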