0425deafdf8ee6d22c3ad0939067bfd0aad96ceb
1 """CSV to database or vice versa."""
8 from sqlalchemy
.orm
.attributes
import instrumentation_registry
9 import sqlalchemy
.sql
.util
10 import sqlalchemy
.types
12 from pokedex
.db
import metadata
13 import pokedex
.db
.tables
as tables
16 def _wildcard_char_to_regex(char
):
17 """Converts a single wildcard character to the regex equivalent."""
24 return re
.escape(char
)
26 def _wildcard_glob_to_regex(glob
):
27 """Converts a single wildcard glob to a regex STRING."""
29 # If it looks like a filename, make it not one
30 if '.' in glob
or '/' in glob
:
31 _
, filename
= os
.path
.split(glob
)
32 table_name
, _
= os
.path
.splitext(filename
)
35 return u
''.join(map(_wildcard_char_to_regex
, glob
))
37 def _wildcards_to_regex(strings
):
38 """Converts a list of wildcard globs to a single regex object."""
40 regex_parts
= map(_wildcard_glob_to_regex
, strings
)
42 regex
= '^(?:' + '|'.join(regex_parts
) + ')$'
44 return re
.compile(regex
)
47 def _get_verbose_prints(verbose
):
48 """If `verbose` is true, returns two functions: one for printing a starting
49 message, and the other for printing a success or failure message when
52 If `verbose` is false, returns two no-op functions.
57 def print_start(thing
):
58 # Truncate to 66 characters, leaving 10 characters for a success
60 truncated_thing
= thing
[0:66]
62 # Also, space-pad to keep the cursor in a known column
63 num_spaces
= 66 - len(truncated_thing
)
65 print "%s...%s" %
(truncated_thing
, ' ' * num_spaces
),
68 def print_done(msg
='ok'):
72 return print_start
, print_done
74 # Not verbose; return dummies
75 def dummy(*args
, **kwargs
):
81 def load(session
, tables
=[], directory
=None, drop_tables
=False, verbose
=False):
82 """Load data from CSV files into the given database session.
84 Tables are created automatically.
87 SQLAlchemy session to use.
90 List of tables to load. If omitted, all tables are loaded.
93 Directory the CSV files reside in. Defaults to the `pokedex` data
97 If set to True, existing `pokedex`-related tables will be dropped.
100 If set to True, status messages will be printed to stdout.
103 # First take care of verbosity
104 print_start
, print_done
= _get_verbose_prints(verbose
)
108 directory
= pkg_resources
.resource_filename('pokedex', 'data/csv')
111 regex
= _wildcards_to_regex(tables
)
112 table_names
= filter(regex
.match
, metadata
.tables
.keys())
114 table_names
= metadata
.tables
.keys()
116 table_objs
= [metadata
.tables
[name
] for name
in table_names
]
117 table_objs
= sqlalchemy
.sql
.util
.sort_tables(table_objs
)
120 # Drop all tables if requested
122 print_start('Dropping tables')
123 for table
in reversed(table_objs
):
124 table
.drop(checkfirst
=True)
127 for table
in table_objs
:
129 connection
= session
.connection()
131 # Okay, run through the tables and actually load the data now
132 for table_obj
in table_objs
:
133 table_name
= table_obj
.name
134 insert_stmt
= table_obj
.insert()
136 print_start(table_name
)
139 csvfile
= open("%s/%s.csv" %
(directory
, table_name
), 'rb')
141 # File doesn't exist; don't load anything!
142 print_done('missing?')
145 reader
= csv
.reader(csvfile
, lineterminator
='\n')
146 column_names
= [unicode(column
) for column
in reader
.next()]
148 # Self-referential tables may contain rows with foreign keys of other
149 # rows in the same table that do not yet exist. Pull these out and add
150 # them to the session last
151 # ASSUMPTION: Self-referential tables have a single PK called "id"
152 deferred_rows
= [] # ( row referring to id, [foreign ids we need] )
153 seen_ids
= {} # primary key we've seen => 1
155 # Fetch foreign key columns that point at this table, if any
156 self_ref_columns
= []
157 for column
in table_obj
.c
:
158 if any(_
.references(table_obj
) for _
in column
.foreign_keys
):
159 self_ref_columns
.append(column
)
162 def insert_and_commit():
163 session
.connection().execute(insert_stmt
, new_rows
)
170 for column_name
, value
in zip(column_names
, csvs
):
171 column
= table_obj
.c
[column_name
]
172 if column
.nullable
and value
== '':
173 # Empty string in a nullable column really means NULL
175 elif isinstance(column
.type, sqlalchemy
.types
.Boolean
):
176 # Boolean values are stored as string values 0/1, but both
177 # of those evaluate as true; SQLA wants True/False
183 # Otherwise, unflatten from bytes
184 value
= value
.decode('utf-8')
186 # nb: Dictionaries flattened with ** have to have string keys
187 row_data
[ str(column_name
) ] = value
189 # May need to stash this row and add it later if it refers to a
190 # later row in this table
192 foreign_ids
= [row_data
[_
.name
] for _
in self_ref_columns
]
193 foreign_ids
= [_
for _
in foreign_ids
if _
] # remove NULL ids
196 # NULL key. Remember this row and add as usual.
197 seen_ids
[row_data
['id']] = 1
199 elif all(_
in seen_ids
for _
in foreign_ids
):
200 # Non-NULL key we've already seen. Remember it and commit
201 # so we know the old row exists when we add the new one
203 seen_ids
[row_data
['id']] = 1
206 # Non-NULL future id. Save this and insert it later!
207 deferred_rows
.append((row_data
, foreign_ids
))
211 new_rows
.append(row_data
)
213 # Remembering some zillion rows in the session consumes a lot of
214 # RAM. Let's not do that. Commit every 1000 rows
215 if len(new_rows
) >= 1000:
220 # Attempt to add any spare rows we've collected
221 for row_data
, foreign_ids
in deferred_rows
:
222 if not all(_
in seen_ids
for _
in foreign_ids
):
223 # Could happen if row A refers to B which refers to C.
224 # This is ridiculous and doesn't happen in my data so far
225 raise ValueError("Too many levels of self-reference! "
226 "Row was: " + str(row
))
228 session
.connection().execute(
229 insert_stmt
.values(**row_data
)
231 seen_ids
[row_data
['id']] = 1
238 def dump(session
, tables
=[], directory
=None, verbose
=False):
239 """Dumps the contents of a database to a set of CSV files. Probably not
240 useful to anyone besides a developer.
243 SQLAlchemy session to use.
246 List of tables to dump. If omitted, all tables are dumped.
249 Directory the CSV files should be put in. Defaults to the `pokedex`
253 If set to True, status messages will be printed to stdout.
256 # First take care of verbosity
257 print_start
, print_done
= _get_verbose_prints(verbose
)
261 directory
= pkg_resources
.resource_filename('pokedex', 'data/csv')
264 regex
= _wildcards_to_regex(tables
)
265 table_names
= filter(regex
.match
, metadata
.tables
.keys())
267 table_names
= metadata
.tables
.keys()
272 for table_name
in table_names
:
273 print_start(table_name
)
274 table
= metadata
.tables
[table_name
]
276 writer
= csv
.writer(open("%s/%s.csv" %
(directory
, table_name
), 'wb'),
278 columns
= [col
.name
for col
in table
.columns
]
279 writer
.writerow(columns
)
281 primary_key
= table
.primary_key
282 for row
in session
.query(table
).order_by(*primary_key
).all():
285 # Convert Pythony values to something more universal
286 val
= getattr(row
, col
)
294 val
= unicode(val
).encode('utf-8')
298 writer
.writerow(csvs
)