1 """CSV to database or vice versa."""
8 from sqlalchemy
.orm
.attributes
import instrumentation_registry
9 import sqlalchemy
.sql
.util
10 import sqlalchemy
.types
12 from pokedex
.db
import metadata
13 import pokedex
.db
.tables
as tables
16 def _wildcard_char_to_regex(char
):
17 """Converts a single wildcard character to the regex equivalent."""
24 return re
.escape(char
)
def _wildcard_glob_to_regex(glob):
    """Converts a single wildcard glob to a regex STRING.

    If `glob` looks like a path to a CSV file (contains '.' or '/'), it is
    first reduced to the bare table name: directory stripped, extension
    dropped.  The result is then translated character-by-character via
    `_wildcard_char_to_regex`.
    """
    # If it looks like a filename, make it not one
    if '.' in glob or '/' in glob:
        _, filename = os.path.split(glob)
        table_name, _ = os.path.splitext(filename)
        # Actually use the extracted table name; without this rebinding the
        # split/splitext above had no effect and the raw path was matched.
        glob = table_name

    return u''.join(map(_wildcard_char_to_regex, glob))
def _wildcards_to_regex(strings):
    """Converts a list of wildcard globs to a single regex object."""
    # Translate each glob, then OR the alternatives together inside a
    # single anchored non-capturing group.
    alternatives = [_wildcard_glob_to_regex(glob) for glob in strings]
    pattern = '^(?:' + '|'.join(alternatives) + ')$'
    return re.compile(pattern)
47 def _get_verbose_prints(verbose
):
48 """If `verbose` is true, returns three functions: one for printing a
49 starting message, one for printing an interim status update, and one for
50 printing a success or failure message when finished.
52 If `verbose` is false, returns no-op functions.
57 def dummy(*args
, **kwargs
):
60 return dummy
, dummy
, dummy
62 ### Okay, verbose == True; print stuff
64 def print_start(thing
):
65 # Truncate to 66 characters, leaving 10 characters for a success
67 truncated_thing
= thing
[0:66]
69 # Also, space-pad to keep the cursor in a known column
70 num_spaces
= 66 - len(truncated_thing
)
72 print "%s...%s" %
(truncated_thing
, ' ' * num_spaces
),
75 if sys
.stdout
.isatty():
76 # stdout is a terminal; stupid backspace tricks are OK.
77 # Don't use print, because it always adds magical spaces, which
78 # makes backspace accounting harder
81 def print_status(msg
):
82 # Overwrite any status text with spaces before printing
83 sys
.stdout
.write('\b' * backspaces
[0])
84 sys
.stdout
.write(' ' * backspaces
[0])
85 sys
.stdout
.write('\b' * backspaces
[0])
88 backspaces
[0] = len(msg
)
90 def print_done(msg
='ok'):
91 # Overwrite any status text with spaces before printing
92 sys
.stdout
.write('\b' * backspaces
[0])
93 sys
.stdout
.write(' ' * backspaces
[0])
94 sys
.stdout
.write('\b' * backspaces
[0])
95 sys
.stdout
.write(msg
+ "\n")
100 # stdout is a file (or something); don't bother with status at all
101 def print_status(msg
):
104 def print_done(msg
='ok'):
107 return print_start
, print_status
, print_done
def load(session, tables=[], directory=None, drop_tables=False, verbose=False):
    """Load data from CSV files into the given database session.

    Tables are created automatically.

    `session`
        SQLAlchemy session to use.

    `tables`
        List of tables to load. If omitted, all tables are loaded.

    `directory`
        Directory the CSV files reside in. Defaults to the `pokedex` data
        directory.

    `drop_tables`
        If set to True, existing `pokedex`-related tables will be dropped.

    `verbose`
        If set to True, status messages will be printed to stdout.

    NOTE(review): this view of the file is missing a number of original
    lines; each gap is flagged with a NOTE(review) comment below and must
    be checked against the full source.
    """

    # First take care of verbosity
    print_start, print_status, print_done = _get_verbose_prints(verbose)

    # NOTE(review): presumably guarded by "if directory is None:" in the
    # full source -- otherwise a caller-supplied directory is clobbered.
    directory = pkg_resources.resource_filename('pokedex', 'data/csv')

    # Translate the wildcard globs in `tables` into one regex and keep the
    # matching table names.
    # NOTE(review): presumably inside "if tables:" in the full source.
    regex = _wildcards_to_regex(tables)
    table_names = filter(regex.match, metadata.tables.keys())
    # NOTE(review): presumably the "else:" branch -- no filter given, so
    # every known table is loaded.
    table_names = metadata.tables.keys()

    # Resolve names to Table objects, sorted so FK targets come before the
    # tables that reference them.
    table_objs = [metadata.tables[name] for name in table_names]
    table_objs = sqlalchemy.sql.util.sort_tables(table_objs)

    # Drop all tables if requested
    # NOTE(review): presumably guarded by "if drop_tables:".
    print_start('Dropping tables')
    # Reverse dependency order so foreign keys don't block the drops.
    for table in reversed(table_objs):
        table.drop(checkfirst=True)

    # NOTE(review): loop body elided in this view -- presumably
    # "table.create()" per table.
    for table in table_objs:

    connection = session.connection()

    # Okay, run through the tables and actually load the data now
    for table_obj in table_objs:
        table_name = table_obj.name
        insert_stmt = table_obj.insert()

        print_start(table_name)

        # NOTE(review): presumably wrapped in try/except IOError in the
        # full source; the "missing?" lines below are the handler body.
        csvpath = "%s/%s.csv" % (directory, table_name)
        csvfile = open(csvpath, 'rb')
        # File doesn't exist; don't load anything!
        print_done('missing?')

        # Total file size, used below for a percentage progress readout.
        csvsize = os.stat(csvpath).st_size

        reader = csv.reader(csvfile, lineterminator='\n')
        # First CSV row is the header: this table's column names.
        column_names = [unicode(column) for column in reader.next()]

        # Self-referential tables may contain rows with foreign keys of other
        # rows in the same table that do not yet exist. Pull these out and add
        # them to the session last
        # ASSUMPTION: Self-referential tables have a single PK called "id"
        deferred_rows = []  # ( row referring to id, [foreign ids we need] )
        seen_ids = {}  # primary key we've seen => 1

        # Fetch foreign key columns that point at this table, if any
        self_ref_columns = []
        for column in table_obj.c:
            if any(_.references(table_obj) for _ in column.foreign_keys):
                self_ref_columns.append(column)

        # Flush the accumulated rows with one executemany, then report
        # progress as the percentage of CSV bytes consumed so far.
        # NOTE(review): lines elided in this view -- presumably the commit
        # and the reset of `new_rows` (a closure over the enclosing scope).
        def insert_and_commit():
            session.connection().execute(insert_stmt, new_rows)
            progress = "{0}%".format(100 * csvfile.tell() // csvsize)
            print_status(progress)

        # NOTE(review): elided lines -- presumably "new_rows = []" plus the
        # "for csvs in reader:" loop header and "row_data = {}".
        for column_name, value in zip(column_names, csvs):
            column = table_obj.c[column_name]
            if column.nullable and value == '':
                # Empty string in a nullable column really means NULL
                # NOTE(review): elided -- presumably "value = None".
            elif isinstance(column.type, sqlalchemy.types.Boolean):
                # Boolean values are stored as string values 0/1, but both
                # of those evaluate as true; SQLA wants True/False
                # NOTE(review): elided -- the 0/1 -> False/True conversion
                # and the "else:" introducing the decode below.
            # Otherwise, unflatten from bytes
            value = value.decode('utf-8')

            # nb: Dictionaries flattened with ** have to have string keys
            row_data[ str(column_name) ] = value

        # May need to stash this row and add it later if it refers to a
        # later row in this table
        # NOTE(review): presumably guarded by "if self_ref_columns:".
        foreign_ids = [row_data[_.name] for _ in self_ref_columns]
        foreign_ids = [_ for _ in foreign_ids if _]  # remove NULL ids

        # NOTE(review): elided "if not foreign_ids:" branch header.
        # NULL key. Remember this row and add as usual.
        seen_ids[row_data['id']] = 1

        elif all(_ in seen_ids for _ in foreign_ids):
            # Non-NULL key we've already seen. Remember it and commit
            # so we know the old row exists when we add the new one
            # NOTE(review): elided -- presumably insert_and_commit() here.
            seen_ids[row_data['id']] = 1

        # NOTE(review): elided "else:" branch header.
            # Non-NULL future id. Save this and insert it later!
            deferred_rows.append((row_data, foreign_ids))
            # NOTE(review): elided -- presumably "continue" so a deferred
            # row is not also appended to new_rows below.

        new_rows.append(row_data)

        # Remembering some zillion rows in the session consumes a lot of
        # RAM. Let's not do that. Commit every 1000 rows
        if len(new_rows) >= 1000:
        # NOTE(review): body elided -- presumably insert_and_commit(),
        # plus one final flush after the reader loop ends.

        # Attempt to add any spare rows we've collected
        for row_data, foreign_ids in deferred_rows:
            if not all(_ in seen_ids for _ in foreign_ids):
                # Could happen if row A refers to B which refers to C.
                # This is ridiculous and doesn't happen in my data so far
                raise ValueError("Too many levels of self-reference! "
                    "Row was: " + str(row))

            # Deferred rows go in one at a time so later deferred rows can
            # see this one's id.
            session.connection().execute(
                insert_stmt.values(**row_data)
            # NOTE(review): closing paren and trailing commit/print_done
            # elided in this view.
            seen_ids[row_data['id']] = 1
def dump(session, tables=[], directory=None, verbose=False):
    """Dumps the contents of a database to a set of CSV files. Probably not
    useful to anyone besides a developer.

    `session`
        SQLAlchemy session to use.

    `tables`
        List of tables to dump. If omitted, all tables are dumped.

    `directory`
        Directory the CSV files should be put in. Defaults to the `pokedex`
        data directory.

    `verbose`
        If set to True, status messages will be printed to stdout.

    NOTE(review): this view of the file is missing a number of original
    lines; each gap is flagged with a NOTE(review) comment below and must
    be checked against the full source.
    """

    # First take care of verbosity
    print_start, print_status, print_done = _get_verbose_prints(verbose)

    # NOTE(review): presumably guarded by "if directory is None:" in the
    # full source, so an explicit output directory is honored.
    directory = pkg_resources.resource_filename('pokedex', 'data/csv')

    # Translate wildcard globs into one regex and keep the matching names.
    # NOTE(review): presumably inside "if tables:" in the full source.
    regex = _wildcards_to_regex(tables)
    table_names = filter(regex.match, metadata.tables.keys())
    # NOTE(review): presumably the "else:" branch -- dump every table.
    table_names = metadata.tables.keys()

    for table_name in table_names:
        print_start(table_name)
        table = metadata.tables[table_name]

        # NOTE(review): the writer call's remaining keyword argument(s)
        # (e.g. lineterminator) and closing paren are on lines elided from
        # this view.
        writer = csv.writer(open("%s/%s.csv" % (directory, table_name), 'wb'),

        # Header row: the table's column names, in declaration order.
        columns = [col.name for col in table.columns]
        writer.writerow(columns)

        # Dump rows ordered by primary key so the CSV output is stable
        # across runs (diff-friendly).
        primary_key = table.primary_key
        for row in session.query(table).order_by(*primary_key).all():
            # NOTE(review): elided -- presumably "csvs = []" and a
            # "for col in columns:" loop around the lines below.
            # Convert Pythony values to something more universal
            val = getattr(row, col)
            # NOTE(review): elided -- special-casing (e.g. None/booleans)
            # before the generic stringification below.
            # Flatten to UTF-8 bytes for the Python 2 csv writer.
            val = unicode(val).encode('utf-8')

            writer.writerow(csvs)
            # NOTE(review): trailing print_done() presumably elided.