#!/usr/bin/env python3
# Assumes that there's a directory named ~/src/www-home which is a git repo
# that the contents of output/ can be copied to, committed, & pushed to the
# production server.

# TODO: replace gallery.tinyletterapp.com images with locally hosted content.
# TODO: in template.html, add apple touch icon, maybe other favicon sizes.
# TODO: local mirrors of all papers in publications.html
# TODO: just put the whole HTML content in the Atom feed?
# TODO: atom feed logo in the top-right.

# Requirements:
# sudo apt install python3-markdown
# sudo apt install python3-smartypants
# sudo apt install python3-bs4
# sudo apt install python3-lxml  (parser that BeautifulSoup is asked for below)

import argparse
from bs4 import BeautifulSoup
import glob
import html
from io import StringIO
import markdown
import operator
import os
import re
import shutil

input_directory = 'content'
static_directory = 'static'
output_directory = 'output'
deploy_directory = '~/src/www-home'

md_extensions = [
    'fenced_code', 'codehilite', 'nl2br', 'toc', 'smarty', 'tables',
    'linkify']

blog_entries = []


def print_file(in_file, out_file):
  print('%-62s -> %s' % (in_file, out_file))


def copy_static_files():
  # Mirror everything under static/ into output/, preserving subdirectories.
  for (dirpath, _, filenames) in os.walk(static_directory):
    for filename in filenames:
      source = os.path.join(dirpath, filename)
      out_path = dirpath.replace(static_directory, '', 1)
      out_path = out_path.lstrip('/')
      dest_dir = os.path.join(output_directory, out_path)
      os.makedirs(dest_dir, exist_ok=True)
      dest = os.path.join(dest_dir, filename)
      print_file(source, dest)
      shutil.copy2(source, dest)


def find_update_date(text):
  # Blog entries are expected to contain a line starting "Posted YYYY-MM-DD"
  # (optionally preceded by a literal '*'); other pages have no such line.
  match = re.search(r'^\*?Posted (\d{4}-\d{2}-\d{2})', text, re.MULTILINE)
  if not match:
    return None
  return match.group(1)


def find_summary(html_content):
  # Use a couple of lines of the rendered text as the Atom feed summary.
  text = BeautifulSoup(html_content, features='lxml').get_text()
  lines = text.split('\n')
  result = ' '.join(lines[2:4])
  return html.escape(result, quote=False)
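
# process_markdown_files() below fills template.html via str.format(), so the
# template is assumed to contain {title}, {content}, and {page_url}
# placeholders (any literal braces in it would need to be doubled as {{ }}).
# The template itself is not part of this script; a minimal sketch of what it
# might look like:
#
#   <!DOCTYPE html>
#   <html lang="en">
#   <head>
#     <title>{title}</title>
#     <link rel="canonical" href="https://www.mcmillen.dev/{page_url}">
#   </head>
#   <body>
#     {content}
#   </body>
#   </html>
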
def process_markdown_files():
  template = open('template.html').read()

  for (dirpath, _, filenames) in os.walk(input_directory):
    for filename in filenames:
      markdown_filename = os.path.join(dirpath, filename)
      if not markdown_filename.endswith('.md'):
        continue

      blog_entry = {}

      markdown_file = open(markdown_filename)
      text = markdown_file.read()
      markdown_file.close()

      # Ensure every page starts with a top-level heading; its first line
      # becomes the page title.
      if not text.startswith('# '):
        text = '# ' + text

      match = re.match(r'^(.*?)\n', text)
      if match:
        title = match.group(1).lstrip('# ')
      else:
        title = text
      blog_entry['title'] = html.escape(title, quote=False)
      title += ' | Colin McMillen'
      if markdown_filename == os.path.join(input_directory, 'index.md'):
        title = 'Colin McMillen'

      out_filename = os.path.basename(markdown_filename).replace(
          '.md', '.html')
      out_dirpath = os.path.join(output_directory, dirpath)
      out_dirpath = out_dirpath.replace('/content', '', 1)
      out_fullpath = os.path.join(out_dirpath, out_filename)
      page_url = out_fullpath.replace('output/', '', 1)
      if page_url.endswith('index.html'):  # strip off index.html
        page_url = page_url[:-len('index.html')]

      html_content = markdown.markdown(
          text, extensions=md_extensions, output_format='html5')
      output = template.format(
          title=title, content=html_content, page_url=page_url)

      # Pages with a "Posted YYYY-MM-DD" line are blog entries; collect them
      # for the Atom feed.
      update_date = find_update_date(text)
      if update_date:
        blog_entry['url'] = 'https://www.mcmillen.dev/' + page_url
        blog_entry['date'] = update_date
        blog_entry['summary'] = find_summary(html_content)
        blog_entries.append(blog_entry)

      os.makedirs(out_dirpath, exist_ok=True)
      print_file(markdown_filename, out_fullpath)
      out_file = open(out_fullpath, 'w')
      out_file.write(output)
      out_file.close()


def make_sitemap():
  # List every .html/.pdf page, excluding Google site-verification files and
  # drafts, and rewrite the paths into absolute URLs.
  sitemap_command = ' '.join("""find output
      -regextype posix-extended -regex '.*.(html|pdf)$' |
      grep -v ^output/google |
      grep -v ^output/drafts |
      perl -pe 's|output|https://www.mcmillen.dev|'
      > output/sitemap.txt""".split('\n'))
  print_file('', 'output/sitemap.txt')
  os.system(sitemap_command)


def make_atom_feed():
  atom_template = '''<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <title>Colin McMillen's Blog</title>
  <updated>{last_update}</updated>
  <author>
    <name>Colin McMillen</name>
  </author>
  <id>https://www.mcmillen.dev/</id>
  <link href="https://www.mcmillen.dev/"/>
{entries}
</feed>
'''

  entry_template = '''
  <entry>
    <title>{title}</title>
    <id>{url}</id>
    <link href="{url}"/>
    <updated>{updated}</updated>
    <summary>{summary} (...)</summary>
  </entry>
'''

  blog_entries.sort(key=operator.itemgetter('date'))

  entries_io = StringIO()
  last_update = None
  for entry in blog_entries:
    # We lie and pretend that all entries were written at noon EST.
    update_date = entry['date'] + 'T12:00:00-04:00'
    last_update = update_date
    entries_io.write(entry_template.format(
        url=entry['url'],
        title=entry['title'],
        updated=update_date,
        summary=entry['summary']))

  entries_text = entries_io.getvalue()
  atom_feed = atom_template.format(
      last_update=last_update,
      entries=entries_text)
  entries_io.close()

  atom_filename = os.path.join(output_directory, 'atom.xml')
  print_file('', atom_filename)
  atom_file = open(atom_filename, 'w')
  atom_file.write(atom_feed)
  atom_file.close()


def copy_site():
  os.system('cp -r output/* %s' % deploy_directory)


def deploy_site():
  copy_site()
  os.chdir(os.path.expanduser(deploy_directory))
  os.system('git add .')
  os.system('git commit -m "automated update from build.py"')
  os.system('git push')


def main():
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--clean', action='store_true',
      help='wipe the output directory before running')
  parser.add_argument(
      '--fast', action='store_true',
      help='only rebuild content files')
  parser.add_argument(
      '--copy', action='store_true',
      help='copy output files to www-home git repo')
  parser.add_argument(
      '--deploy', action='store_true',
      help='deploy the site by pushing the www-home git repo to production')
  args = parser.parse_args()

  if args.clean:
    shutil.rmtree(output_directory)
  os.makedirs(output_directory, exist_ok=True)

  if not args.fast:
    copy_static_files()
  process_markdown_files()
  make_sitemap()
  make_atom_feed()

  if args.copy and not args.deploy:
    copy_site()
  if args.deploy:
    if args.fast:
      print('cowardly refusing to deploy a site that was built with --fast')
    else:
      deploy_site()


if __name__ == '__main__':
  main()
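
# Example invocations (flags as defined in main() above):
#   ./build.py            # build the site into output/
#   ./build.py --clean    # wipe output/ first, then rebuild everything
#   ./build.py --fast     # skip static files; only rebuild content pages
#   ./build.py --copy     # also copy output/ into the www-home git repo
#   ./build.py --deploy   # copy, commit, and push www-home to production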