bin/edit-csv-as-yaml

   1 #!/usr/bin/env python
   2 """Quick, dirty script that will convert a csv file to yaml, spawn an editor
   3 for you to fiddle with it, then convert back to csv and replace the original
   4 file.
   5
   6 Run me as: $0 some_file.csv
   7
   8 The editor used is $EDITOR, of course.
   9
  10 This script is not guaranteed to be even remotely reliable, so consider only
  11 using it on files in source control.
  12 """
  13
  14 import codecs
  15 import csv
  16 import os
  17 import subprocess
  18 import sys
  19 import tempfile
  20
  21 try:
  22     import yaml
  23 except ImportError:
  24     sys.stderr.write("Please install PyYAML.\n")
  25     sys.exit(13)
  26
  27 # Try to use ordered dicts, so the YAML keys are in database table order
  28 odict = dict  # fall back to regular dict
  29 try:
  30     from collections import OrderedDict as odict
  31 except ImportError:
  32     try:
  33         # This is a library for 2.4-2.6
  34         from ordereddict import OrderedDict as odict
  35     except ImportError:
  36         pass
  37
  38 # Tell PyYAML how to dump our ordered dict.
  39 # The items() is to avoid the sorting the library does automatically.
  40 # Needs to be added to SafeDumper manually, because we use safe_dump below, and
  41 # every Representer class has its own independent goddamn dict of these things
  42 from yaml.dumper import SafeDumper
  43 yaml.add_representer(
  44     odict,
  45     lambda dumper, data: dumper.represent_dict(data.items()),
  46     Dumper=SafeDumper,
  47 )
  48
  49 ### Do actual work!
  50 infilename, = sys.argv[1:]
  51
  52 data = []
  53 with open(infilename) as infile:
  54     reader = csv.reader(infile, lineterminator='\n')
  55     column_names = [unicode(column) for column in next(reader)]
  56
  57     # Read data...
  58     for row in reader:
  59         datum = odict()
  60         for col, value in zip(column_names, row):
  61             datum[col] = value.decode('utf-8')
  62
  63         data.append(datum)
  64
  65
  66 # Monkeypatch yaml to use > syntax for multiline text; easier to edit
  67 from yaml.emitter import Emitter
  68 orig_choose_scalar_style = Emitter.choose_scalar_style
  69 def new_choose_scalar_style(self):
  70     if self.analysis is None:
  71         self.analysis = self.analyze_scalar(self.event.value)
  72     if self.analysis.multiline or len(self.analysis.scalar) > 80:
  73         return '>'
  74     return orig_choose_scalar_style(self)
  75 Emitter.choose_scalar_style = new_choose_scalar_style
  76
  77 # Write to a tempfile
  78 with tempfile.NamedTemporaryFile(suffix='.yml') as tmp:
  79     yaml.safe_dump(data, tmp,
  80         default_flow_style=False,
  81         allow_unicode=True,
  82         indent=4,
  83     )
  84     del data  # reclaim rams!
  85
  86     error_line = ''  # used on errors
  87     while True:
  88         args = [os.environ['EDITOR'], tmp.name]
  89         if 'vim' in os.environ['EDITOR']:
  90             # vim has an arg for jumping to a line:
  91             args.append("+{0}".format(error_line))
  92
  93         # Run the user's editor and wait for it to close
  94         subprocess.Popen(args).wait()
  95         tmp.seek(0)
  96
  97         try:
  98             new_data = yaml.safe_load(tmp)
  99             break
 100         except yaml.YAMLError as e:
 101             if hasattr(e, 'problem_mark'):
 102                 error_line = e.problem_mark.line + 1
 103             else:
 104                 error_line = ''
 105
 106             print
 107             print "Oh my god what have you done:"
 108             print
 109             print str(e)
 110             print
 111             print "Press Enter to try again, or I guess ctrl-c to bail."
 112             raw_input()
 113
 114 with open(infilename, 'wb') as outfile:
 115     writer = csv.writer(outfile, lineterminator='\n')
 116     writer.writerow([ column.encode('utf8') for column in column_names ])
 117
 118     for datum in new_data:
 119         writer.writerow([
 120             datum[column].encode('utf8') for column in column_names
 121         ])