1 """CSV to database or vice versa."""
6 from sqlalchemy
.orm
.attributes
import instrumentation_registry
7 import sqlalchemy
.types
9 from pokedex
.db
import metadata
10 import pokedex
.db
.tables
as tables


def _get_verbose_prints(verbose):
    """If `verbose` is true, returns two functions: one for printing a starting
    message, and the other for printing a success or failure message when
    finished.

    If `verbose` is false, returns two no-op functions.
    """

    if verbose:
        def print_start(thing):
            # Truncate to 66 characters, leaving 10 characters for a success
            # or failure message
            truncated_thing = thing[0:66]

            # Also, space-pad to keep the cursor in a known column
            num_spaces = 66 - len(truncated_thing)

            # The trailing comma suppresses the newline; flush so the partial
            # line actually appears before the work starts
            print "%s...%s" % (truncated_thing, ' ' * num_spaces),
            sys.stdout.flush()

        def print_done(msg='ok'):
            print msg

        return print_start, print_done

    # Not verbose; return dummies
    def dummy(*args, **kwargs):
        pass

    return dummy, dummy
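
# A quick sketch of the helper above in use (illustrative only; the table
# name and exact padding are hypothetical):
#
#     print_start, print_done = _get_verbose_prints(True)
#     print_start('berries')   # prints "berries...        " with no newline
#     print_done()             # finishes the line with "ok"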


def load(session, directory=None, drop_tables=False, verbose=False):
    """Load data from CSV files into the given database session.

    Tables are created automatically.

    `session`
        SQLAlchemy session to use.

    `directory`
        Directory the CSV files reside in.  Defaults to the `pokedex` data
        directory.

    `drop_tables`
        If set to True, existing `pokedex`-related tables will be dropped.

    `verbose`
        If set to True, status messages will be printed to stdout.
    """

    # First take care of verbosity
    print_start, print_done = _get_verbose_prints(verbose)

    if not directory:
        directory = pkg_resources.resource_filename('pokedex', 'data/csv')

    # Drop all tables if requested
    if drop_tables:
        print_start('Dropping tables')
        metadata.drop_all()
        print_done()

    # (Re)create the schema, then grab a connection for the bulk inserts
    metadata.create_all()
    connection = session.connection()

    # Okay, run through the tables and actually load the data now
    for table_obj in metadata.sorted_tables:
        table_name = table_obj.name
        insert_stmt = table_obj.insert()

        print_start(table_name)

        try:
            csvfile = open("%s/%s.csv" % (directory, table_name), 'rb')
        except IOError:
            # File doesn't exist; don't load anything!
            print_done('missing?')
            continue

        reader = csv.reader(csvfile, lineterminator='\n')
        column_names = [unicode(column) for column in reader.next()]

        # Self-referential tables may contain rows with foreign keys of other
        # rows in the same table that do not yet exist.  Pull these out and add
        # them to the session last
        # ASSUMPTION: Self-referential tables have a single PK called "id"
        deferred_rows = []  # ( row referring to id, [foreign ids we need] )
        seen_ids = {}       # primary key we've seen => 1

        # Fetch foreign key columns that point at this table, if any
        self_ref_columns = []
        for column in table_obj.c:
            if any(_.references(table_obj) for _ in column.foreign_keys):
                self_ref_columns.append(column)

        # Rows are batched up and inserted in chunks; committing as we go
        # keeps the session from hoarding memory
        new_rows = []
        def insert_and_commit():
            session.connection().execute(insert_stmt, new_rows)
            session.commit()
            new_rows[:] = []
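
        # For example (hypothetical column name): if this table had an
        # `evolves_from_id` column referencing its own `id`, a row with
        # id=5, evolves_from_id=9 arriving before id 9 exists would be
        # parked in deferred_rows and only inserted after the main pass.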

        for csvs in reader:
            row_data = {}

            for column_name, value in zip(column_names, csvs):
                column = table_obj.c[column_name]
                if column.nullable and value == '':
                    # Empty string in a nullable column really means NULL
                    value = None
                elif isinstance(column.type, sqlalchemy.types.Boolean):
                    # Boolean values are stored as string values 0/1, but both
                    # of those evaluate as true; SQLA wants True/False
                    value = (value != '0')
                else:
                    # Otherwise, unflatten from bytes
                    value = value.decode('utf-8')

                # nb: Dictionaries flattened with ** have to have string keys
                row_data[ str(column_name) ] = value

            # May need to stash this row and add it later if it refers to a
            # later row in this table
            if self_ref_columns:
                foreign_ids = [row_data[_.name] for _ in self_ref_columns]
                foreign_ids = [_ for _ in foreign_ids if _]  # remove NULL ids

                if not foreign_ids:
                    # NULL key.  Remember this row and add as usual.
                    seen_ids[row_data['id']] = 1

                elif all(_ in seen_ids for _ in foreign_ids):
                    # Non-NULL key we've already seen.  Remember it and commit
                    # so we know the old row exists when we add the new one
                    insert_and_commit()
                    seen_ids[row_data['id']] = 1

                else:
                    # Non-NULL future id.  Save this and insert it later!
                    deferred_rows.append((row_data, foreign_ids))
                    continue

            new_rows.append(row_data)

            # Remembering some zillion rows in the session consumes a lot of
            # RAM.  Let's not do that.  Commit every 1000 rows
            if len(new_rows) > 1000:
                insert_and_commit()

        # Flush whatever is left over from the last chunk
        insert_and_commit()

        # Attempt to add any spare rows we've collected
        for row_data, foreign_ids in deferred_rows:
            if not all(_ in seen_ids for _ in foreign_ids):
                # Could happen if row A refers to B which refers to C.
                # This is ridiculous and doesn't happen in my data so far
                raise ValueError("Too many levels of self-reference!  "
                                 "Row was: " + str(row_data))

            session.connection().execute(
                insert_stmt.values(**row_data)
            )
            seen_ids[row_data['id']] = 1

        session.commit()
        print_done()


def dump(session, directory=None, verbose=False):
    """Dumps the contents of a database to a set of CSV files.  Probably not
    useful to anyone besides a developer.

    `session`
        SQLAlchemy session to use.

    `directory`
        Directory the CSV files should be put in.  Defaults to the `pokedex`
        data directory.

    `verbose`
        If set to True, status messages will be printed to stdout.
    """

    # First take care of verbosity
    print_start, print_done = _get_verbose_prints(verbose)

    if not directory:
        directory = pkg_resources.resource_filename('pokedex', 'data/csv')

    for table_name in sorted(metadata.tables.keys()):
        print_start(table_name)
        table = metadata.tables[table_name]

        writer = csv.writer(open("%s/%s.csv" % (directory, table_name), 'wb'),
                            lineterminator='\n')
        columns = [col.name for col in table.columns]
        writer.writerow(columns)

        primary_key = table.primary_key
        for row in session.query(table).order_by(*primary_key).all():
            csvs = []
            for col in columns:
                # Convert Pythony values to something more universal
                val = getattr(row, col)
                if val is None:
                    # NULL round-trips as the empty string
                    val = ''
                elif val is True:
                    val = '1'
                elif val is False:
                    val = '0'
                else:
                    val = unicode(val).encode('utf-8')

                csvs.append(val)

            writer.writerow(csvs)

        print_done()
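

# A minimal driver sketch, assuming a local SQLite database; the URL below is
# hypothetical, and binding `metadata` to the engine is assumed so the bare
# drop_all()/create_all() calls in load() can find it.
if __name__ == '__main__':
    from sqlalchemy import create_engine
    from sqlalchemy.orm import sessionmaker

    engine = create_engine('sqlite:///pokedex.sqlite')  # hypothetical path
    metadata.bind = engine
    session = sessionmaker(bind=engine)()

    # Load the bundled CSVs into a fresh schema, then round-trip back to CSV
    load(session, drop_tables=True, verbose=True)
    dump(session, verbose=True)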