1 """CSV to database or vice versa."""
6 from sqlalchemy
.orm
.attributes
import instrumentation_registry
7 import sqlalchemy
.types
9 from pokedex
.db
import metadata
10 import pokedex
.db
.tables
as tables

def _get_verbose_prints(verbose):
    """If `verbose` is true, returns two functions: one for printing a starting
    message, and the other for printing a success or failure message when
    finished.

    If `verbose` is false, returns two no-op functions.
    """

    if verbose:
        def print_start(thing):
            # Truncate to 66 characters, leaving 10 characters for a success
            # or failure message
            truncated_thing = thing[0:66]

            # Also, space-pad to keep the cursor in a known column
            num_spaces = 66 - len(truncated_thing)

            print "%s...%s" % (truncated_thing, ' ' * num_spaces),

        def print_done(msg='ok'):
            print msg

        return print_start, print_done

    # Not verbose; return dummies
    def dummy(*args, **kwargs):
        pass

    return dummy, dummy


def load(session, directory=None, drop_tables=False, verbose=False):
    """Load data from CSV files into the given database session.

    Tables are created automatically.

    `session`
        SQLAlchemy session to use.

    `directory`
        Directory the CSV files reside in.  Defaults to the `pokedex` data
        directory.

    `drop_tables`
        If set to True, existing `pokedex`-related tables will be dropped.

    `verbose`
        If set to True, status messages will be printed to stdout.
    """
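
    # Minimal usage sketch (illustrative only; assumes the pokedex `metadata`
    # is bound to the same engine the session uses, and the database URL
    # below is hypothetical):
    #
    #     from sqlalchemy import create_engine
    #     from sqlalchemy.orm import sessionmaker
    #
    #     engine = create_engine('sqlite:///pokedex.sqlite')
    #     metadata.bind = engine
    #     session = sessionmaker(bind=engine)()
    #     load(session, drop_tables=True, verbose=True)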

    # First take care of verbosity
    print_start, print_done = _get_verbose_prints(verbose)

    if directory is None:
        directory = pkg_resources.resource_filename('pokedex', 'data/csv')

    # Drop all tables if requested
    if drop_tables:
        print_start('Dropping tables')
        metadata.drop_all()
        print_done()

    metadata.create_all()

    # SQLAlchemy offers no direct way to get a list of ORM classes, so the
    # only option is to inspect the module they all happen to live in for
    # things that look right.
    table_base = tables.TableBase
    orm_classes = {}  # table object => table class

    for name in dir(tables):
        # dir() returns strings!  How /convenient/.
        thingy = getattr(tables, name)

        if not isinstance(thingy, type):
            # Not even a class; bail
            continue
        elif not issubclass(thingy, table_base):
            # Not a declarative table; bail
            continue
        elif thingy == table_base:
            # Declarative table base, so not a real table; bail
            continue

        # thingy is definitely a table class!  Hallelujah.
        orm_classes[thingy.__table__] = thingy

    # Okay, run through the tables and actually load the data now
    for table_obj in metadata.sorted_tables:
        table_class = orm_classes[table_obj]
        table_name = table_obj.name

        print_start(table_name)

        try:
            csvfile = open("%s/%s.csv" % (directory, table_name), 'rb')
        except IOError:
            # File doesn't exist; don't load anything!
            print_done('missing?')
            continue

        reader = csv.reader(csvfile, lineterminator='\n')
        column_names = [unicode(column) for column in reader.next()]

        # Self-referential tables may contain rows with foreign keys of other
        # rows in the same table that do not yet exist.  Pull these out and add
        # them to the session last
        # ASSUMPTION: Self-referential tables have a single PK called "id"
        deferred_rows = []  # ( row referring to id, [foreign ids we need] )
        seen_ids = {}       # primary key we've seen => 1

        # Fetch foreign key columns that point at this table, if any
        self_ref_columns = []
        for column in table_obj.c:
            if any(_.references(table_obj) for _ in column.foreign_keys):
                self_ref_columns.append(column)

        for csvs in reader:
            row = table_class()

            for column_name, value in zip(column_names, csvs):
                column = table_obj.c[column_name]
                if column.nullable and value == '':
                    # Empty string in a nullable column really means NULL
                    value = None
                elif isinstance(column.type, sqlalchemy.types.Boolean):
                    # Boolean values are stored as string values 0/1, but both
                    # of those evaluate as true; SQLA wants True/False
                    value = (value == '1')
                else:
                    # Otherwise, unflatten from bytes
                    value = value.decode('utf-8')

                setattr(row, column_name, value)

            # May need to stash this row and add it later if it refers to a
            # later row in this table
            if self_ref_columns:
                foreign_ids = [getattr(row, _.name) for _ in self_ref_columns]
                foreign_ids = [_ for _ in foreign_ids if _]  # remove NULL ids

                if not foreign_ids:
                    # NULL key.  Remember this row and add as usual.
                    seen_ids[row.id] = 1

                elif all(_ in seen_ids for _ in foreign_ids):
                    # Non-NULL key we've already seen.  Remember it and commit
                    # so we know the old row exists when we add the new one
                    session.commit()
                    seen_ids[row.id] = 1

                else:
                    # Non-NULL future id.  Save this and insert it later!
                    deferred_rows.append((row, foreign_ids))
                    continue

            session.add(row)

        session.commit()

        # Attempt to add any spare rows we've collected
        for row, foreign_ids in deferred_rows:
            if not all(_ in seen_ids for _ in foreign_ids):
                # Could happen if row A refers to B which refers to C.
                # This is ridiculous and doesn't happen in my data so far
                raise ValueError("Too many levels of self-reference!  "
                                 "Row was: " + str(row.__dict__))

            session.add(row)
            seen_ids[row.id] = 1
            session.commit()

        print_done()


def dump(session, directory=None, verbose=False):
    """Dumps the contents of a database to a set of CSV files.  Probably not
    useful to anyone besides a developer.

    `session`
        SQLAlchemy session to use.

    `directory`
        Directory the CSV files should be put in.  Defaults to the `pokedex`
        data directory.

    `verbose`
        If set to True, status messages will be printed to stdout.
    """
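
    # Minimal usage sketch (illustrative only; the engine/session setup
    # mirrors the load() example above, and the output directory here is
    # hypothetical):
    #
    #     session = sessionmaker(bind=create_engine('sqlite:///pokedex.sqlite'))()
    #     dump(session, directory='/tmp/pokedex-csv', verbose=True)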

    # First take care of verbosity
    print_start, print_done = _get_verbose_prints(verbose)

    if directory is None:
        directory = pkg_resources.resource_filename('pokedex', 'data/csv')

    for table_name in sorted(metadata.tables.keys()):
        print_start(table_name)
        table = metadata.tables[table_name]

        writer = csv.writer(open("%s/%s.csv" % (directory, table_name), 'wb'),
                            lineterminator='\n')
        columns = [col.name for col in table.columns]
        writer.writerow(columns)

        primary_key = table.primary_key
        for row in session.query(table).order_by(*primary_key).all():
            csvs = []
            for col in columns:
                # Convert Pythony values to something more universal
                val = getattr(row, col)
                if val is None:
                    # NULL is stored as an empty string
                    val = ''
                elif val is True:
                    # Booleans are stored as 0/1, matching what load() expects
                    val = '1'
                elif val is False:
                    val = '0'
                else:
                    val = unicode(val).encode('utf-8')

                csvs.append(val)

            writer.writerow(csvs)

        print_done()