summaryrefslogtreecommitdiffstats
path: root/_controllers/blog/post.py
diff options
context:
space:
mode:
authorguyzmo <guyzmo@leloop.org>2011-01-14 11:51:54 +0100
committerguyzmo <guyzmo@leloop.org>2011-01-14 11:51:54 +0100
commit48c34a6f62e12f7cb2a7a710dc99cb8d53957cfe (patch)
treef2fd775016309f35ea57a1ee071a9d7231a193f3 /_controllers/blog/post.py
init
Diffstat (limited to '_controllers/blog/post.py')
-rw-r--r--_controllers/blog/post.py342
1 files changed, 342 insertions, 0 deletions
diff --git a/_controllers/blog/post.py b/_controllers/blog/post.py
new file mode 100644
index 0000000..0c7f698
--- /dev/null
+++ b/_controllers/blog/post.py
@@ -0,0 +1,342 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+post.py parses post sources from the ./_post directory.
+"""
+
+__author__ = "Ryan McGuire (ryan@enigmacurry.com)"
+__date__ = "Mon Feb 2 21:21:04 2009"
+
+import os
+import sys
+import datetime
+import re
+import operator
+import urlparse
+import hashlib
+import codecs
+
+import pytz
+import yaml
+import logging
+import BeautifulSoup
+
+import blogofile_bf as bf
+
+logger = logging.getLogger("blogofile.post")
+
+config = bf.config.controllers.blog.post
+config.mod = sys.modules[globals()["__name__"]]
+
+# These are all the Blogofile reserved field names for posts. It is not
+# recommended that users re-use any of these field names for purposes other
+# than the one stated.
+reserved_field_names = {
+ "title" :"A one-line free-form title for the post",
+ "date" :"The date that the post was originally created",
+ "updated" :"The date that the post was last updated",
+ "categories" :"A list of categories that the post pertains to, "\
+ "each seperated by commas",
+ "tags" :"A list of tags that the post pertains to, "\
+ "each seperated by commas",
+ "permalink" :"The full permanent URL for this post. "\
+ "Automatically created if not provided",
+ "path" :"The path from the permalink of the post",
+ "guid" :"A unique hash for the post, if not provided it "\
+ "is assumed that the permalink is the guid",
+ "slug" :"The title part of the URL for the post, if not "\
+ "provided it is automatically generated from the title."\
+ "It is not used if permalink does not contain :title",
+ "author" :"The name of the author of the post",
+ "filters" :"The filter chain to apply to the entire post. "\
+ "If not specified, a default chain based on the file extension is "\
+ "applied. If set to 'None' it disables all filters, even default ones.",
+ "filter" :"synonym for filters",
+ "draft" :"If 'true' or 'True', the post is considered to be only a "\
+ "draft and not to be published.",
+ "source" :"Reserved internally",
+ "yaml" :"Reserved internally",
+ "content" :"Reserved internally",
+ "filename" :"Reserved internally"
+ }
+
+
+class PostParseException(Exception):
+
+ def __init__(self, value):
+ self.value = value
+
+ def __str__(self):
+ return repr(self.value)
+
+
+class Post(object):
+ """
+ Class to describe a blog post and associated metadata
+ """
+ def __init__(self, source, filename="Untitled"):
+ self.source = source
+ self.yaml = None
+ self.title = None
+ self.__timezone = bf.config.controllers.blog.timezone
+ self.date = None
+ self.updated = None
+ self.categories = set()
+ self.tags = set()
+ self.permalink = None
+ self.content = u""
+ self.excerpt = u""
+ self.filename = filename
+ self.author = ""
+ self.guid = None
+ self.slug = None
+ self.draft = False
+ self.filters = None
+ self.__parse()
+ self.__post_process()
+
+ def __repr__(self): #pragma: no cover
+ return "<Post title='{0}' date='{1}'>".format(
+ self.title, self.date.strftime("%Y/%m/%d %H:%M:%S"))
+
+ def __parse(self):
+ """Parse the yaml and fill fields"""
+ yaml_sep = re.compile("^---$", re.MULTILINE)
+ content_parts = yaml_sep.split(self.source, maxsplit=2)
+ if len(content_parts) < 2:
+ raise PostParseException("{0}: Post has no YAML section".format(
+ self.filename))
+ else:
+ #Extract the yaml at the top
+ self.__parse_yaml(content_parts[1])
+ post_src = content_parts[2]
+ self.__apply_filters(post_src)
+ #Do post excerpting
+ self.__parse_post_excerpting()
+
+ def __apply_filters(self, post_src):
+ """Apply filters to the post"""
+ #Apply block level filters (filters on only part of the post)
+ # TODO: block level filters on posts
+ #Apply post level filters (filters on the entire post)
+ #If filter is unspecified, use the default filter based on
+ #the file extension:
+ if self.filters is None:
+ try:
+ file_extension = os.path.splitext(self.filename)[-1][1:]
+ self.filters = bf.config.controllers.blog.post_default_filters[
+ file_extension]
+ except KeyError:
+ self.filters = []
+ self.content = bf.filter.run_chain(self.filters, post_src)
+
+ def __parse_post_excerpting(self):
+ if bf.config.controllers.blog.post_excerpts.enabled:
+ length = bf.config.controllers.blog.post_excerpts.word_length
+ try:
+ self.excerpt = bf.config.post_excerpt(self.content, length)
+ except AttributeError:
+ self.excerpt = self.__excerpt(length)
+
+ def __excerpt(self, num_words=50):
+ #Default post excerpting function
+ #Can be overridden in _config.py by
+ #defining post_excerpt(content,num_words)
+ if len(self.excerpt) == 0:
+ """Retrieve excerpt from article"""
+ s = BeautifulSoup.BeautifulSoup(self.content)
+ # get rid of javascript, noscript and css
+ [[tree.extract() for tree in s(elem)] for elem in (
+ 'script', 'noscript', 'style')]
+ # get rid of doctype
+ subtree = s.findAll(text=re.compile("DOCTYPE|xml"))
+ [tree.extract() for tree in subtree]
+ # remove headers
+ [[tree.extract() for tree in s(elem)] for elem in (
+ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6')]
+ text = ''.join(s.findAll(text=True))\
+ .replace("\n", "").split(" ")
+ return " ".join(text[:num_words]) + '...'
+
+ def __post_process(self):
+ # fill in empty default value
+ if not self.title:
+ self.title = u"Untitled - {0}".format(
+ datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
+
+ if not self.slug:
+ self.slug = re.sub("[ ?]", "-", self.title).lower()
+
+ if not self.date:
+ self.date = datetime.datetime.now(pytz.timezone(self.__timezone))
+ if not self.updated:
+ self.updated = self.date
+
+ if not self.categories or len(self.categories) == 0:
+ self.categories = set([Category('Uncategorized')])
+ if not self.permalink and \
+ bf.config.controllers.blog.auto_permalink.enabled:
+ self.permalink = bf.config.site.url.rstrip("/") + \
+ bf.config.controllers.blog.auto_permalink.path
+ self.permalink = \
+ re.sub(":blog_path", bf.config.blog.path, self.permalink)
+ self.permalink = \
+ re.sub(":year", self.date.strftime("%Y"), self.permalink)
+ self.permalink = \
+ re.sub(":month", self.date.strftime("%m"), self.permalink)
+ self.permalink = \
+ re.sub(":day", self.date.strftime("%d"), self.permalink)
+ self.permalink = \
+ re.sub(":title", self.slug, self.permalink)
+
+ # TODO: slugification should be abstracted out somewhere reusable
+ self.permalink = re.sub(
+ ":filename", re.sub(
+ "[ ?]", "-", self.filename).lower(), self.permalink)
+
+ # Generate sha hash based on title
+ self.permalink = re.sub(":uuid", hashlib.sha1(
+ self.title.encode('utf-8')).hexdigest(), self.permalink)
+
+ logger.debug("Permalink: {0}".format(self.permalink))
+
+ def __parse_yaml(self, yaml_src):
+ y = yaml.load(yaml_src)
+ # Load all the fields that require special processing first:
+ fields_need_processing = ('permalink', 'guid', 'date', 'updated',
+ 'categories', 'tags', 'draft')
+ try:
+ self.permalink = y['permalink']
+ if self.permalink.startswith("/"):
+ self.permalink = urlparse.urljoin(bf.config.site.url,
+ self.permalink)
+ #Ensure that the permalink is for the same site as bf.config.site.url
+ if not self.permalink.startswith(bf.config.site.url):
+ raise PostParseException("{0}: permalink for a different site"
+ " than configured".format(self.filename))
+ logger.debug("path from permalink: {0}".format(self.path))
+ except KeyError:
+ pass
+ try:
+ self.guid = y['guid']
+ except KeyError:
+ self.guid = self.permalink
+ try:
+ self.date = pytz.timezone(self.__timezone).localize(
+ datetime.datetime.strptime(y['date'], config.date_format))
+ except KeyError:
+ pass
+ try:
+ self.updated = pytz.timezone(self.__timezone).localize(
+ datetime.datetime.strptime(y['updated'], config.date_format))
+ except KeyError:
+ pass
+ try:
+ self.categories = set([Category(x.strip()) for x in \
+ y['categories'].split(",")])
+ except:
+ pass
+ try:
+ self.tags = set([x.strip() for x in y['tags'].split(",")])
+ except:
+ pass
+ try:
+ self.filters = y['filter'] #filter is a synonym for filters
+ except KeyError:
+ pass
+ try:
+ if y['draft']:
+ self.draft = True
+ logger.info("Post {0} is set to draft, "
+ "ignoring this post".format(self.filename))
+ else:
+ self.draft = False
+ except KeyError:
+ self.draft = False
+ # Load the rest of the fields that don't need processing:
+ for field, value in y.items():
+ if field not in fields_need_processing:
+ setattr(self,field,value)
+
+ def permapath(self):
+ """Get just the path portion of a permalink"""
+ return urlparse.urlparse(self.permalink)[2]
+
+ def __cmp__(self, other_post):
+ "Posts should be comparable by date"
+ return cmp(self.date, other_post.date)
+
+ def __eq__(self, other_post):
+ return self is other_post
+
+ def __getattr__(self, name):
+ if name == "path":
+ #Always generate the path from the permalink
+ return self.permapath()
+ else:
+ raise AttributeError, name
+
+
+class Category(object):
+
+ def __init__(self, name):
+ self.name = unicode(name)
+ # TODO: slugification should be abstracted out somewhere reusable
+ # TODO: consider making url_name and path read-only properties?
+ self.url_name = self.name.lower().replace(" ", "-")
+ self.path = bf.util.site_path_helper(
+ bf.config.controllers.blog.path,
+ bf.config.controllers.blog.category_dir,
+ self.url_name)
+
+ def __eq__(self, other):
+ if self.name == other.name:
+ return True
+ return False
+
+ def __hash__(self):
+ return hash(self.name)
+
+ def __repr__(self):
+ return self.name
+
+ def __cmp__(self, other):
+ return cmp(self.name, other.name)
+
+
+def parse_posts(directory):
+ """Retrieve all the posts from the directory specified.
+
+ Returns a list of the posts sorted in reverse by date."""
+ posts = []
+ post_filename_re = re.compile(
+ ".*((\.textile$)|(\.markdown$)|(\.org$)|(\.html$)|(\.txt$)|(\.rst$))")
+ if not os.path.isdir("_posts"):
+ logger.warn("This site has no _posts directory.")
+ return []
+ post_paths = [f for f in bf.util.recursive_file_list(
+ directory, post_filename_re) if post_filename_re.match(f)]
+
+ for post_path in post_paths:
+ post_fn = os.path.split(post_path)[1]
+ logger.debug("Parsing post: {0}".format(post_path))
+ #IMO codecs.open is broken on Win32.
+ #It refuses to open files without replacing newlines with CR+LF
+ #reverting to regular open and decode:
+ try:
+ src = open(post_path, "r").read().decode(
+ bf.config.controllers.blog.post_encoding)
+ except:
+ logger.exception("Error reading post: {0}".format(post_path))
+ raise
+ try:
+ p = Post(src, filename=post_fn)
+ except PostParseException as e:
+ logger.warning("{0} : Skipping this post.".format(e.value))
+ continue
+ #Exclude some posts
+ if not (p.permalink is None or p.draft is True):
+ posts.append(p)
+ posts.sort(key=operator.attrgetter('date'), reverse=True)
+ return posts