Better server files like robots, 404, etc.

This commit is contained in:
Ariejan de Vroom 2013-03-24 22:27:56 +01:00
parent 4ce61f8b40
commit 528b0ae80e
7 changed files with 335 additions and 11 deletions

59
Rules
View File

@ -1,17 +1,28 @@
#!/usr/bin/env ruby
BYPASS_FILES = %w(404.html crossdomain.xml humans.txt robots.txt) unless defined?(BYPASS_FILES)
preprocess do
create_robots_txt
create_webmaster_tools_authentications
create_sitemap
end
BYPASS_FILES.each do |file|
compile("/#{file.sub /\..+/, ''}/") do
# don't filter bypass files
end
compile %r{^/(google|robots|assets)} do
end
compile %r{/_.+/$} do
# don't filter partials
end
# Sitemap and htaccess do get filtered with erb, but get no layout.
compile %r{^/(sitemap|htaccess)/$} do
filter :erb
end
compile '/sitemap/', :rep => 'gzip' do
filter :erb
filter :shellcmd, :cmd => 'gzip'
end
compile '/css/*/' do
# filter :sass, syntax: :scss, load_paths: SASS_LOAD_PATHS
filter :sass, Compass.sass_engine_options
@ -36,6 +47,13 @@ compile '/posts/*' do
filter :cache_buster
end
compile %r{^/(404)/$} do
filter :haml, format: :html5, ugly: true
layout 'default'
filter :cache_buster
end
compile '*' do
unless item.binary?
case item[:extension]
@ -55,16 +73,19 @@ compile '*' do
end
end
BYPASS_FILES.each do |file|
route("/#{file.sub /\..+/, ''}/") do
"/#{file}" # route bypass files as is
end
end
route %r{/_.+/$} do
nil # don't route partials
end
route %r{^/(assets/.*|sitemap|robots|atom)/$} do
ext = item[:extension]
ext = 'js' if ext == 'coffee'
ext = 'css' if ext == 'scss'
fp = cachebust?(item) ? fingerprint(item[:filename]) : ''
item.identifier.chop + fp + '.' + ext
end
route '/css/*/' do
fp = fingerprint(item[:filename])
item.identifier.chop + fp + '.css'
@ -79,6 +100,22 @@ route '/rss/' do
'/rss.xml'
end
route '/htaccess/' do
'/.htaccess'
end
route '/sitemap/', :rep => 'gzip' do
'/sitemap.xml.gz'
end
route '/sitemap/' do
'/sitemap.xml'
end
route %r{^/(404)/$} do
item.identifier.chop + '.html'
end
route '/posts/*' do
y, m, d, slug = /([0-9]+)\-([0-9]+)\-([0-9]+)\-([^\/]+)/.match(item.identifier).captures
"/#{y}/#{m}/#{d}/#{slug}/index.html"

126
content/htaccess.txt Normal file
View File

@ -0,0 +1,126 @@
# ----------------------------------------------------------------------
# Start rewrite engine
# ----------------------------------------------------------------------
# Turning on the rewrite engine is necessary for the following rules and features.
<IfModule mod_rewrite.c>
RewriteEngine On
</IfModule>
# ----------------------------------------------------------------------
# Suppress or force the "www." at the beginning of URLs
# ----------------------------------------------------------------------
# The same content should never be available under two different URLs - especially not with and
# without "www." at the beginning, since this can cause SEO problems (duplicate content).
# That's why you should choose one of the alternatives and redirect the other one.
# By default option 1 (no "www.") is activated. Remember: Shorter URLs are sexier.
# no-www.org/faq.php?q=class_b
# If you rather want to use option 2, just comment out all option 1 lines
# and uncomment option 2.
# IMPORTANT: NEVER USE BOTH RULES AT THE SAME TIME!
# ----------------------------------------------------------------------
<IfModule mod_rewrite.c>
RewriteCond %{HTTPS} !=on
RewriteCond %{HTTP_HOST} ^www\.(.+)$ [NC]
RewriteRule ^(.*)$ http://%1/$1 [R=301,L]
</IfModule>
# ----------------------------------------------------------------------
# Add/remove trailing slash to (non-file) URLs
# ----------------------------------------------------------------------
# Google treats URLs with and without trailing slashes separately.
# Forcing a trailing slash is usually preferred, but all that's really
# important is that one correctly redirects to the other.
# By default option 1 (force trailing slash) is activated.
# http://googlewebmastercentral.blogspot.com/2010/04/to-slash-or-not-to-slash.html
# http://www.alistapart.com/articles/slashforward/
# http://httpd.apache.org/docs/2.0/misc/rewriteguide.html (see the "Trailing Slash Problem" section)
# ----------------------------------------------------------------------
<IfModule mod_rewrite.c>
RewriteCond %{REQUEST_FILENAME} !-f
RewriteCond %{REQUEST_URI} !(\.[a-zA-Z0-9]{1,5}|/|#(.*))$
RewriteRule ^(.*)$ /$1/ [R=301,L]
</IfModule>
# ----------------------------------------------------------------------
# Option 2:
# Rewrite "domain.com/foo/ -> domain.com/foo"
#<IfModule mod_rewrite.c>
# RewriteRule ^(.*)/$ /$1 [R=301,L]
#</IfModule>
# ----------------------------------------------------------------------
# Prevent 404 errors for non-existing redirected folders
# ----------------------------------------------------------------------
# without -MultiViews, Apache will give a 404 for a rewrite if a folder of the same name does not exist
# e.g. /blog/hello : webmasterworld.com/apache/3808792.htm
Options -MultiViews
# ----------------------------------------------------------------------
# custom 404 page
# ----------------------------------------------------------------------
# You can add custom pages to handle 500 or 403 pretty easily, if you like.
ErrorDocument 404 /404.html
# ----------------------------------------------------------------------
# UTF-8 encoding
# ----------------------------------------------------------------------
# use utf-8 encoding for anything served text/plain or text/html
AddDefaultCharset utf-8
# force utf-8 for a number of file formats
AddCharset utf-8 .html .css .js .xml .json .rss
# ----------------------------------------------------------------------
# A little more security
# ----------------------------------------------------------------------
# Do we want to advertise the exact version number of Apache we're running?
# Probably not.
# NOTE: ServerTokens is only allowed in the main server configuration
# (httpd.conf). Leaving it active inside an .htaccess file makes Apache
# answer every request with "500 Internal Server Error", so it is kept
# commented out here — set it in httpd.conf instead.
# ServerTokens Prod
# "-Indexes" will have Apache block users from browsing folders without a default document
# Usually you should leave this activated, because you shouldn't allow everybody to surf through
# every folder on your server (which includes rather private places like CMS system folders).
Options -Indexes
# Block access to "hidden" directories whose names begin with a period. This
# includes directories used by version control systems such as Subversion or Git.
<IfModule mod_rewrite.c>
RewriteRule "(^|/)\." - [F]
</IfModule>
<% if @site.config[:redirects] %>
# Set up URL redirects<% @site.config[:redirects].each do |h| %>
Redirect 301 <%= h[:from] %> <%= h[:to] %>
<% end %><% end %>

View File

@ -15,6 +15,7 @@ unless defined? LOADED_DEFAULT_CONFIG
include Nanoc3::Helpers::HTMLEscape
include Nanoc3::Helpers::Rendering
include Nanoc3::Helpers::LinkTo
include Nanoc3::Helpers::XMLSitemap
# cachebuster
require 'nanoc/cachebuster'

92
lib/preprocessors.rb Normal file
View File

@ -0,0 +1,92 @@
# Preprocessor helpers
#
# This file has a collection of methods that are meant to be used in the
# preprocess-block in the Nanoc Rules file.
#
# @author Arjan van der Gaag
# Build and register a sitemap.xml item via Nanoc's own xml_sitemap helper,
# by dynamically adding a new '/sitemap/' item.
#
# Before adding the sitemap item, items that should stay out of the sitemap
# are marked hidden: typical asset extensions plus error pages and the
# htaccess file. An already-present :is_hidden attribute is never
# overwritten, so individual files can still override this per-file.
#
# @todo extract hidden file types into configuration file?
def create_sitemap
  return unless @site.config[:output_generated_assets]

  asset_extensions = %w{png gif jpg jpeg coffee scss sass less css xml js txt ico}
  @items.each do |entry|
    next unless asset_extensions.include?(entry[:extension]) ||
                entry.identifier =~ /404|500|htaccess/
    entry[:is_hidden] = true unless entry.attributes.has_key?(:is_hidden)
  end

  @items << Nanoc3::Item.new(
    "<%= xml_sitemap %>",
    { :extension => 'xml', :is_hidden => true },
    '/sitemap/'
  )
end
# Use special settings from the site configuration to generate the files
# necessary for various webmaster tools authentications, such as the services
# from Google, Yahoo and Bing.
#
# This loops through all the entries in the `webmaster_tools` setting, using
# each entry's remaining properties (after removing :content and :identifier)
# as the attributes of a newly created item.
#
# See config.yaml for more documentation on the input format.
def create_webmaster_tools_authentications
  return unless @site.config[:output_generated_assets]
  @site.config[:webmaster_tools].each do |file|
    next if file[:identifier].nil?
    content = file.delete(:content)
    identifier = file.delete(:identifier)
    # BUGFIX: the original called `file.merge({ :is_hidden => true })` and
    # discarded the result — Hash#merge is non-mutating, so the generated
    # items were never actually hidden. Mutate the attributes hash in place.
    file[:is_hidden] = true
    @items << Nanoc3::Item.new(
      content,
      file,
      identifier
    )
  end
end
# Generate a robots.txt file in the root of the site by dynamically adding
# a new '/robots/' item.
#
# When the :robots -> :default site setting is true-ish, a sensible default
# is emitted: disallow /assets except /assets/images, and point at the
# sitemap file. Otherwise the Allow/Disallow/Sitemap directives are
# assembled from the :robots configuration; see the config.yaml file for
# more information on the expected input format.
def create_robots_txt
  return unless @site.config[:output_generated_assets]

  robots = @site.config[:robots]
  return unless robots

  content =
    if robots[:default]
      <<-EOS
User-agent: *
Disallow: /assets
Allow: /assets/images
Sitemap: #{@site.config[:base_url]}/sitemap.xml
EOS
    else
      directives = ['User-Agent: *']
      directives.concat(robots[:disallow].map { |path| "Disallow: #{path}" })
      directives.concat((robots[:allow] || []).map { |path| "Allow: #{path}" })
      directives << "Sitemap: #{robots[:sitemap]}"
      directives.join("\n")
    end

  @items << Nanoc3::Item.new(
    content,
    { :extension => 'txt', :is_hidden => true },
    '/robots/'
  )
end

31
lib/shellcmd_filter.rb Normal file
View File

@ -0,0 +1,31 @@
require 'open3'
# This nanoc filter is a general purpose filter that pipes the contents of
# an item into a given shell command and replaces the item's content with
# the command's standard output.
#
# Open3.capture3 feeds stdin and drains stdout/stderr concurrently, so —
# unlike the original popen3 write-then-read implementation — large inputs
# no longer risk an I/O deadlock. Stderr is captured and discarded, matching
# the original behavior.
#
# Usage:
#
#   compile '/static/js/*/' do
#     # minify JS :)
#     filter :shellcmd, :cmd => "java -jar js-compiler.jar"
#   end
#
# Originally written by Vincent Driessen (http://twitter.com/nvie) and
# released to the public domain.
#
# http://nvie.com
class ShellCmdFilter < Nanoc3::Filter
  identifier :shellcmd

  # @param content [String] the item's content, piped to the command's stdin
  # @param params [Hash] expects :cmd, the shell command to run
  # @return [String] the command's standard output
  def run(content, params={ :cmd => "sed s/foo/bar/" })
    out, _err, _status = Open3.capture3(params[:cmd], :stdin_data => content)
    out
  end
end

View File

@ -24,6 +24,43 @@ title: 'ariejan.net'
author_name: 'Ariejan de Vroom'
author_uri: 'http://ariejan.net'
# Configure the robots.txt file for this site.
# Setting 'default' to true-ish will use sensible defaults. If you
# wish to customize it, you can list paths to allow and to disallow.
# Finally, you could manually set the path to the sitemap file.
#
# You can customize the robots file fairly well like this, but you
# can always manually create a content file with the exact contents
# you need.
robots:
default: true # disallow assets, allow assets/images and point at sitemap
# disallow:
# - '/tag'
# - '/newsletter'
# allow:
# - '/tag/foo'
# sitemap: '/site-map.txt'
# Set up authentication files for various webmaster tools (or something
# similar). This simply creates a plain text file when generating the site.
#
# identifier: identifier of the output file, e.g. '/google12345/'
# content: content of the file, e.g. 'aoa8202ns001'
# extension: extension of the output file, e.g. 'html' or 'xml'
webmaster_tools:
-
identifier:
content:
extension:
# Should Nanoc generate some standard asset files for you, or skip them
# completely?
#
# This currently controls the generation of robots.txt, sitemap.xml,
# sitemap.xml.gz and webmaster tools authentication files.
output_generated_assets: true
prune:
# Whether to automatically remove files not managed by nanoc from the output
# directory. For safety reasons, this is turned off by default.