www-builder/build.py

#!/usr/bin/env python3

# Assumes that there's a directory named ~/src/www-home which is a git repo
# that the contents of output/ can be copied to, committed, & pushed to the
# production server.

# TODO: replace gallery.tinyletterapp.com images with locally hosted content.
# TODO: in template.html, add apple touch icon, maybe other favicon sizes.
# TODO: local mirrors of all papers in publications.html
# TODO: just put the whole HTML content in the Atom feed?
# TODO: atom feed logo in the top-right.

# Requirements:
# sudo apt install python3-markdown
# sudo apt install python3-smartypants
# sudo apt install python3-bs4
# sudo apt install python3-lxml  (find_summary asks BeautifulSoup for the
#                                 'lxml' parser)


import argparse
from bs4 import BeautifulSoup
import glob
import html
from io import StringIO
import markdown
import operator
import os
import re
import shutil


# Directories used by the build, relative to the current working directory.
input_directory = 'content'    # walked for .md files by process_markdown_files
static_directory = 'static'    # copied as-is by copy_static_files
output_directory = 'output'    # where the generated site is written

# Kept with a literal '~': copy_site hands it to the shell and deploy_site
# runs it through os.path.expanduser.
deploy_directory = '~/src/www-home'

# Extension names passed to markdown.markdown() for every page.
md_extensions = [
    'fenced_code',
    'codehilite',
    'nl2br',
    'toc',
    'smarty',
    'tables',
    'linkify',
]

# Filled by process_markdown_files with one dict per dated page (keys: title,
# url, date, summary) and read by make_atom_feed.
blog_entries = []


def print_file(in_file, out_file):
  """Print one progress line of the form '<in_file> -> <out_file>'.

  The source name is left-justified and space-padded to 62 columns so that
  the arrows of consecutive lines line up; longer names are not truncated.
  """
  padded_source = str(in_file).ljust(62)
  print(' -> '.join((padded_source, str(out_file))))


def copy_static_files():
  """Copy every file under static_directory to the same relative path under
  output_directory.

  Destination directories are created on demand, one progress line is printed
  per file, and shutil.copy2 is used for the copy. A directory that contains
  no files of its own is not created in the output tree.
  """
  for (source_dir, _, names) in os.walk(static_directory):
    # Drop the leading static_directory component (and the slash after it)
    # to get this directory's path relative to the static root.
    relative_dir = source_dir.replace(static_directory, '', 1).lstrip('/')
    target_dir = os.path.join(output_directory, relative_dir)
    for name in names:
      os.makedirs(target_dir, exist_ok=True)
      source = os.path.join(source_dir, name)
      dest = os.path.join(target_dir, name)
      print_file(source, dest)
      shutil.copy2(source, dest)


def find_update_date(text):
  """Return the date of the first 'Posted YYYY-MM-DD' line in text.

  The marker must sit at the very start of a line, optionally preceded by a
  single '*' (markdown emphasis). Returns the date as a 'YYYY-MM-DD' string,
  or None when no line matches.
  """
  posted = re.search(
      r'^\*?Posted (\d{4}-\d{2}-\d{2})', text, flags=re.MULTILINE)
  return posted.group(1) if posted else None


def find_summary(html_content):
  """Build a short plain-text summary from a rendered page.

  The HTML is reduced to text with BeautifulSoup (lxml parser), split on
  newlines, and the third and fourth lines are joined with a single space.
  The result is HTML-escaped (&, <, > only; quotes are left alone).
  """
  soup = BeautifulSoup(html_content, features='lxml')
  plain_lines = soup.get_text().split('\n')
  # Presumably lines 0-1 are the title and the "Posted ..." line, so the
  # summary starts with the text after them -- verify against real pages.
  excerpt = ' '.join(plain_lines[2:4])
  return html.escape(excerpt, quote=False)


def _first_heading(text):
  """Return the first line of text with leading '#' and space characters
  stripped (works whether or not the text contains a newline)."""
  first_line = text.split('\n', 1)[0]
  return first_line.lstrip('# ')


def process_markdown_files():
  """Render every .md file under input_directory to HTML in output_directory.

  Each file is converted with markdown.markdown() and substituted into
  template.html (read from the current working directory) through
  str.format with the fields title, content and page_url. The output keeps
  the file's path relative to input_directory, with '.md' swapped for
  '.html'.

  Pages containing a 'Posted YYYY-MM-DD' line (see find_update_date) are
  also appended to the module-level blog_entries list as dicts with the keys
  title, url, date and summary.
  """
  with open('template.html') as template_file:
    template = template_file.read()

  index_filename = os.path.join(input_directory, 'index.md')

  for (dirpath, _, filenames) in os.walk(input_directory):
    # Mirror this directory's position below input_directory into
    # output_directory; normpath folds the '.' produced for the root itself.
    relative_dir = os.path.relpath(dirpath, input_directory)
    out_dirpath = os.path.normpath(
        os.path.join(output_directory, relative_dir))

    for filename in filenames:
      if not filename.endswith('.md'):
        continue
      markdown_filename = os.path.join(dirpath, filename)

      with open(markdown_filename) as markdown_file:
        text = markdown_file.read()

      # Pages are expected to open with a top-level heading; add the marker
      # when the author left it off so the first line renders as one.
      if not text.startswith('# '):
        text = '# ' + text

      heading = _first_heading(text)
      if markdown_filename == index_filename:
        title = 'Colin McMillen'
      else:
        title = heading + ' | Colin McMillen'

      # Swap only the trailing extension; a '.md' elsewhere in the name is
      # part of the name and must survive.
      out_filename = filename[:-len('.md')] + '.html'
      out_fullpath = os.path.join(out_dirpath, out_filename)

      # Site-relative URL path: always forward slashes, and a directory's
      # index page is addressed by the directory itself.
      page_url = os.path.relpath(out_fullpath, output_directory)
      page_url = page_url.replace(os.sep, '/')
      if out_filename == 'index.html':
        page_url = page_url[:-len('index.html')]

      html_content = markdown.markdown(
          text, extensions=md_extensions, output_format='html5')
      output = template.format(
          title=title, content=html_content, page_url=page_url)

      update_date = find_update_date(text)
      if update_date:
        blog_entries.append({
            'title': html.escape(heading, quote=False),
            'url': 'https://www.mcmillen.dev/' + page_url,
            'date': update_date,
            'summary': find_summary(html_content),
        })

      os.makedirs(out_dirpath, exist_ok=True)
      print_file(markdown_filename, out_fullpath)
      with open(out_fullpath, 'w') as out_file:
        out_file.write(output)


def make_sitemap():
  """Write output/sitemap.txt: one absolute URL per .html or .pdf file.

  The list is produced by a shell pipeline run through os.system, so `find`
  (with -regextype support, i.e. GNU find), `grep` and `perl` must be on the
  PATH. Anything under output/google* or output/drafts* is left out. The
  pipeline's exit status is not checked.
  """
  pipeline = ' | '.join([
      # The dot before the extension is escaped so that only real
      # '.html' / '.pdf' suffixes match, not any name ending in 'html'/'pdf'.
      r"find output -regextype posix-extended -regex '.*\.(html|pdf)$'",
      'grep -v ^output/google',
      'grep -v ^output/drafts',
      # Turn the leading 'output' of each path into the site origin.
      "perl -pe 's|output|https://www.mcmillen.dev|'",
  ])
  sitemap_command = pipeline + ' > output/sitemap.txt'
  print_file('', 'output/sitemap.txt')
  os.system(sitemap_command)


def make_atom_feed():
  """Write atom.xml into output_directory from the collected blog_entries.

  blog_entries is sorted in place by its 'date' string (ascending), each
  entry is rendered as an Atom <entry>, and the feed-level <updated> is the
  timestamp of the last (newest) entry. The file is written as UTF-8 to
  match the encoding named in the XML declaration.

  NOTE: when blog_entries is empty, last_update stays None and the feed's
  <updated> element contains the text 'None'.
  """
  atom_template = '''<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">

  <title>Colin McMillen's Blog</title>
  <link href="https://www.mcmillen.dev"/>
  <link rel="self" href="https://www.mcmillen.dev/atom.xml"/>
  <updated>{last_update}</updated>
  <author>
    <name>Colin McMillen</name>
  </author>
  <id>https://www.mcmillen.dev/</id>

  {entries}
</feed>
'''

  entry_template = '''
  <entry>
    <title>{title}</title>
    <id>{url}</id>
    <link rel="alternate" href="{url}"/>
    <content type="text/html" src="{url}"/>
    <updated>{updated}</updated>
    <summary>{summary} (...)</summary>
  </entry>
'''

  blog_entries.sort(key=operator.itemgetter('date'))

  rendered_entries = []
  last_update = None
  for entry in blog_entries:
    # Posts only record a date, so we lie and pretend that every entry was
    # written at noon with a fixed -04:00 offset (US Eastern Daylight Time).
    update_date = entry['date'] + 'T12:00:00-04:00'
    last_update = update_date
    rendered_entries.append(entry_template.format(
        url=entry['url'],
        title=entry['title'],
        updated=update_date,
        summary=entry['summary']))

  atom_feed = atom_template.format(
      last_update=last_update,
      entries=''.join(rendered_entries))

  atom_filename = os.path.join(output_directory, 'atom.xml')
  print_file('', atom_filename)
  # The XML declaration above promises UTF-8, so don't leave the encoding to
  # the locale default.
  with open(atom_filename, 'w', encoding='utf-8') as atom_file:
    atom_file.write(atom_feed)


def copy_site():
  """Copy everything under output/ into deploy_directory with `cp -r`.

  The command goes through os.system, so it is the shell that expands the
  output/* glob and the leading '~' of deploy_directory. The exit status is
  ignored.
  """
  copy_command = 'cp -r output/* {}'.format(deploy_directory)
  os.system(copy_command)


def deploy_site():
  """Copy the built site into the deploy repo, then commit and push it.

  Changes the process's working directory to deploy_directory (with '~'
  expanded) and runs `git add`, `git commit` and `git push` there in that
  order. Each command runs regardless of the previous command's exit status.
  """
  copy_site()
  repo_path = os.path.expanduser(deploy_directory)
  os.chdir(repo_path)
  git_commands = (
      'git add .',
      'git commit -m "automated update from build.py"',
      'git push',
  )
  for git_command in git_commands:
    os.system(git_command)


def main():
  """Parse the command-line flags and run the build.

  Flags:
    --clean   remove output_directory before building.
    --fast    skip copying static files (content, sitemap and feed are
              still rebuilt).
    --copy    copy the result into the www-home repo; ignored when --deploy
              is also given, because deploying copies the site itself.
    --deploy  copy, commit and push the www-home repo. Refused when combined
              with --fast.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--clean', action='store_true',
      help='wipe the output directory before running')
  parser.add_argument(
      '--fast', action='store_true',
      help='only rebuild content files')
  parser.add_argument(
      '--copy', action='store_true',
      help='copy output files to www-home git repo')
  parser.add_argument(
      '--deploy', action='store_true',
      help='deploy the site by pushing the www-home git repo to production')
  args = parser.parse_args()

  if args.clean:
    # A missing output directory is already clean (e.g. the very first
    # build), so that case must not abort the run.
    try:
      shutil.rmtree(output_directory)
    except FileNotFoundError:
      pass
  os.makedirs(output_directory, exist_ok=True)
  if not args.fast:
    copy_static_files()
  process_markdown_files()
  make_sitemap()
  make_atom_feed()

  if args.copy and not args.deploy:
    copy_site()

  if args.deploy:
    if args.fast:
      print('cowardly refusing to deploy a site that was built with --fast')
    else:
      deploy_site()


if __name__ == '__main__':
  main()