_controllers/org.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
org.py: convert an Org-mode source file into an HTML file.
"""

__author__ = "Jaemok Jeong(jmjeong@gmail.com)"
__date__   = "Tue Aug 11 12:50:17 2009"


import os
import tempfile
import logging
import re
import sys
import commands
import codecs
import datetime
import pytz
from BeautifulSoup import BeautifulSoup

import blogofile_bf as bf

# Module-level logger, registered under the "blogofile.org" name; the org
# class in this file uses it for its debug output.
logger = logging.getLogger("blogofile.org")


class EmacsNotFoundException(Exception):
    """Signal that the Emacs-based org export could not be carried out.

    Within this module it is raised when no Emacs binary is configured and
    when the Emacs export command exits with a non-zero status.
    """


# Shorthand for the object stored at controllers.blog.post.mod in the
# blogofile configuration (presumably the blog controller's post module --
# confirm). The org class in this file uses post.Category to build category
# objects from headline tags.
post = bf.config.controllers.blog.post.mod


class org(object):
    """
    Convert an Org-mode source string into HTML through a batch Emacs run.

    The source text is written to a temporary ``.org`` file (preceded by
    ``bf.config.blog.emacs_orgmode_preamble`` when that setting exists),
    Emacs is launched in batch mode with ``org-export-as-html-batch``, and
    the generated HTML file is parsed with BeautifulSoup.

    Attributes available after construction:

    source     -- the org text that was passed in
    title      -- text of the first headline (the h2 inside the
                  ``outline-container-1`` div), or None
    categories -- set of ``post.Category`` objects built from the tags of
                  that headline, or None
    date       -- timezone-aware datetime parsed from the headline's
                  timestamp, localized to ``bf.config.blog_timezone``,
                  or None
    content    -- unicode HTML: the table of contents (when present)
                  followed by the first outline container with its
                  headline removed, or None when nothing could be built

    Raises EmacsNotFoundException when ``bf.config.blog.emacs_binary`` is
    not defined or when the Emacs command exits with a non-zero status.
    """

    def __init__(self, source):
        self.source = source
        # Give every result attribute a default so callers can always read
        # them, even when part of the extraction fails.
        self.title = None
        self.categories = None
        self.date = None
        self.content = None
        self.__convert()

    def __convert(self):
        """Export ``self.source`` with Emacs and fill in the attributes."""
        soup = BeautifulSoup(self.__export_html())

        # The first outline container holds the post; its h2 headline
        # carries the title, the tags (categories) and the timestamp.
        container = soup.find('div', {'id': 'outline-container-1'})
        if container is None:
            logger.warning("Exported HTML has no 'outline-container-1' "
                           "section; title, categories and date are unset")
            metaline = None
        else:
            metaline = container.h2

        self.__extract_metadata(metaline)

        # Drop the headline so it is not repeated inside the content.
        if metaline is not None:
            metaline.extract()

        toc = soup.find('div', {'id': 'table-of-contents'})
        try:
            parts = [str(node) for node in (toc, container)
                     if node is not None]
            if parts:
                # NOTE(review): this decodes the serialized tags with the
                # configured post encoding, as the original code did; it
                # assumes str() of a tag yields bytes in that encoding --
                # confirm for non-UTF-8 configurations.
                self.content = "".join(parts).decode(
                        bf.config.blog_post_encoding)
        except Exception:
            # Best effort, as before, but leave a trace instead of hiding
            # the failure; self.content keeps its None default.
            logger.exception("Could not build content from exported HTML")

    def __export_html(self):
        """Run Emacs on the org source and return the exported HTML text.

        Returns the decoded (unicode) contents of the HTML file that Emacs
        wrote next to the temporary org file. Both temporary files are
        cleaned up, also when the export or the decoding fails.
        """
        try:
            pname = bf.config.blog.emacs_binary
        except AttributeError:
            raise EmacsNotFoundException("Emacs binary is not defined")

        encoding = bf.config.blog_post_encoding
        temp_file = tempfile.NamedTemporaryFile(suffix='.org')
        try:
            # The preamble is optional: a missing setting is not an error.
            try:
                temp_file.write(bf.config.blog.emacs_orgmode_preamble)
                temp_file.write("\n")
            except AttributeError:
                pass
            temp_file.write(self.source.encode(encoding))
            # Emacs reads the file by name, so the data must be on disk.
            temp_file.flush()

            pname += " --batch"
            try:
                if bf.config.blog.emacs_preload_elisp:
                    pname += " --load={0}".format(
                            bf.config.blog.emacs_preload_elisp)
            except AttributeError:
                pass
            pname += " --visit={0} --funcall org-export-as-html-batch".format(
                    temp_file.name)
            logger.debug("Exec name::: %s", pname)

            # NOTE: the command line is run through the shell; the
            # configured binary/elisp values are interpolated unquoted.
            status, output = commands.getstatusoutput(pname)
            logger.debug("Convert output:::\n\t%s", output)
            if status:
                raise EmacsNotFoundException("orgfile filter failed")

            # Emacs writes "<name>.html" next to "<name>.org".
            html = temp_file.name[:-4] + '.html'
        finally:
            # Closing a NamedTemporaryFile also deletes it.
            temp_file.close()

        # IMO codecs.open is broken on Win32.
        # It refuses to open files without replacing newlines with CR+LF
        # reverting to regular open and decode:
        html_file = open(html, "rb")
        try:
            return html_file.read().decode(encoding)
        finally:
            html_file.close()
            # remove the temporary html file
            os.remove(html)

    def __extract_metadata(self, metaline):
        """Set title, categories and date from the headline tag.

        ``metaline`` is the h2 tag of the first outline container, or None
        when there is none. Each field is extracted independently and is
        set to None when it cannot be determined.
        """
        # extract title
        try:
            self.title = re.sub('&nbsp;', '', metaline.contents[0]).strip()
        except (AttributeError, IndexError, TypeError):
            # No headline, an empty headline, or a non-text first child.
            self.title = None

        # extract category
        try:
            tags = metaline('span', {'class': 'tag'})[0].string
            self.categories = set(post.Category(x)
                                  for x in tags.split('&nbsp;'))
        except Exception:
            logger.debug("No categories extracted from headline",
                         exc_info=True)
            self.categories = None

        # extract date
        try:
            # e.g. "2009-08-22 Sat 15:22"
            stamp = metaline('span', {'class': 'timestamp'})[0].string
            naive = datetime.datetime.strptime(stamp, "%Y-%m-%d %a %H:%M")
            # pytz zones must be attached with localize(); passing them as
            # tzinfo= to replace() picks the zone's first recorded offset
            # (often local mean time) instead of the one valid at that date.
            zone = pytz.timezone(bf.config.blog_timezone)
            self.date = zone.localize(naive)
        except Exception:
            logger.debug("No date extracted from headline", exc_info=True)
            self.date = None


if __name__ == '__main__':
    # When executed as a script, run the doctests found in this module and
    # report each one verbosely.
    from doctest import testmod
    testmod(verbose=True)