In the process of redoing RevisionedDict

feature/insanity
Sven Slootweg 11 years ago
parent 4927b5e7a3
commit 1f32cac481

@ -1,133 +1,159 @@
import string, random # Problems:
# - How to handle list diffs? We can't just replace the whole list; that would still lose data.
# - Over-engineering? Python already interns primitives, so is there any point in storing object references rather than just direct revision maps?
# -> We would still need to pre-process dicts and lists before storage, and compare them...
# Ideas:
# - Download PDF/EPUB headers and extract metadata from there
def random_id(): import string, random, copy
return "".join(random.choice(string.lowercase + string.uppercase + string.digits) for x in xrange(0, 14)) from collections import OrderedDict
from difflib import SequenceMatcher
class RevisionedDict(object): class RevisionedDict(object):
def __init__(self, parent=None): def __init__(self, data):
self.latest_revision = "" self._revisions = OrderedDict({})
self.parent = parent self._applied_diffs = []
self.revisions = {} self._add_revision(data)
self.objects = {}
def _generate_revision_id(self):
def __eq__(self, other): return "".join(random.choice(string.lowercase + string.uppercase + string.digits) for x in xrange(0, 14))
# This is a tricky one... we need to compare this RevisionedDict against the other thing - which is almost certainly a dict.
# We'll just compare keys and values. def _add_revision(self, data):
new_revision_id = self._generate_revision_id()
self._revisions[new_revision_id] = copy.deepcopy(data)
self._last_revision = new_revision_id
return new_revision_id
# Return a deep copy of the data stored under self._last_revision, with an
# extra "_rev" key holding that revision id. Because the result is a copy,
# mutating it does not touch the stored revision.
# The stored data must support item assignment (e.g. a dict) for the "_rev" tag to work.
def _get_last_revision(self): # Always returns a copy!
base_revision = copy.deepcopy(self._revisions[self._last_revision])
base_revision["_rev"] = self._last_revision # This is to be able to identify the source revision for a modified serialized object later
return base_revision
# Apply `diff` on top of the latest revision, store the result as a new
# revision, and record the (revision id, diff) pair in self._applied_diffs.
# `diff` must provide .apply(data, intermediate_diffs) and .origin_revision.
def _apply_diff(self, diff):
# diff.apply gets a copy of the latest revision plus whatever _diffs_since returns for the diff's origin revision
# NOTE(review): the apply() visible in this view takes no self and has no return statement - confirm this call works and new_data is not None
new_data = diff.apply(self._get_last_revision(), self._diffs_since(diff.origin_revision))
new_revision_id = self._add_revision(new_data)
# _diffs_since looks up revision ids in this list, so each entry pairs the new revision id with its diff
self._applied_diffs.append((new_revision_id, diff))
def _diffs_since(self, revision_id):
try: try:
if set(self.keys()) != set(other.keys()): revision_index = next(x for x in enumerate(self._applied_diffs) if x[1][0] == revision_id)
return False return [x[1] for x in self._applied_diffs[revision_index[0] + 1:]]
except AttributeError, e: except StopIteration, e:
return False # Not a dict(-like) return [x[1] for x in self._applied_diffs]
latest_rev = self._get_latest_revision() def update(self, data):
for key, value in other.iteritems(): diff = self.autodiff(data)
if self.objects[latest_rev[key]] != value: self._apply_diff(diff)
return False
def diff(self, data, origin_revision):
return True # Figure out if any revisions happened in the meantime
return RevisionedDictDiff(data, self._revisions[origin_revision], origin_revision)
def __len__(self):
return len(self._get_latest_revision()) def autodiff(self, data):
# Takes the revision number from the data
def __getitem__(self, key): return self.diff(data, data["_rev"])
return self.objects[self._get_latest_revision()[key]]
class RevisionedDictDiff(object):
def __setitem__(self, key, value): def __init__(self, data, origin_data, origin_revision):
obj = self._dump_latest_revision() self.origin_revision = origin_revision
obj[key] = value self._diff_data = self._diff_structure(data, origin_data)
self.update(obj)
def _diff_structure(self, structure, origin_structure, structure_key=None):
def __delitem__(self, key): if isinstance(structure, dict):
obj = self._dump_latest_revision() if isinstance(origin_structure, dict):
del obj[key] # Compare dicts
self.update(obj) opcodes = []
def __contains__(self, key): removed_keys = set(origin_structure.keys()) - set(structure.keys())
return (key in self._get_latest_revision())
for key in removed_keys:
def keys(self): opcodes.append(("delete", key))
return self._get_latest_revision().keys()
new_keys = set(structure.keys()) - set(origin_structure.keys())
def values(self):
return [self.objects[id_] for id_ in self._get_latest_revision().values()] for key in new_keys:
if key != "_rev": # Ignore added _rev key
def items(self): opcodes.append(("insert", key, structure[key]))
return [(key, self.objects[id_]) for key, id_ in self._get_latest_revision().items()]
for key, value in structure.iteritems():
if key not in new_keys:
if value == origin_structure[key]:
opcodes.append(("equal", key))
else:
if isinstance(value, dict) or isinstance(value, list):
opcodes.append(self._diff_structure(value, origin_structure[key], structure_key=key))
else:
opcodes.append(("replace", key, value))
return ("dict", structure_key, opcodes)
else:
return ("replace", structure)
elif isinstance(structure, list):
if isinstance(origin_structure, list):
# Compare lists (does NOT support nested dictionaries yet!)
return ("list", structure_key, SequenceMatcher(a=origin_structure, b=structure, autojunk=False).get_opcodes())
else:
return ("replace", structure)
else:
return ("replace", structure)
# The below are awful... this really isn't how iterators are supposed to work def _apply_structure(self, structure, diff_data, intermediate_diffs):
pass
# for every key
# if list
# calculate_offsets (TODO)
# apply structure
# if dict
# apply structure
# else
# apply diff data
# return key
def iterkeys(self): def apply(data, intermediate_diffs=[]):
return iter(self._get_latest_revision().keys()) # This will apply the diff against the specified source data
data = copy.deepcopy(data)
self._apply_structure(data, self._diff_data, intermediate_diffs)
def itervalues(self):
return iter([self.objects[id_] for id_ in self._get_latest_revision().values()])
def iteritems(self):
return iter([(key, self.objects[id_]) for key, id_ in self._get_latest_revision().items()])
# TODO: __iter__, __reversed__ origin = {
"type": "message",
"data": {
"title": "Sample title",
"author": "Sample author",
"isbn": ["a0", "a1", "a2", "a3"],
"description": ["test one", "test two"],
"eq": ["a", "b", "c"]
}
}
def _add_revision(data): """
object_map = {} revdict = RevisionedDict(origin)
latest_rev = self._get_latest_revision() origin = revdict._get_last_revision()
anything_changed = False
for key in data.keys(): origin["herp"] = "derp"
try: origin["data"]["isbn"].remove("a2")
try: origin["data"]["isbn"].insert(0, "a4")
is_dict = isinstance(self.objects[latest_rev[key]][0], RevisionedDict) origin["data"]["author"] = "Other author"
except IndexError, e:
is_dict = False
if is_dict: #import json
unchanged = self.objects[latest_rev[key]][0] == data[key]: #print json.dumps(revdict.autodiff(origin)._diff_data, indent=4)
else:
unchanged = self.objects[latest_rev[key]] == data[key]:
except KeyError, e:
# Doesn't exist in last rev, new key
unchanged = False
if unchanged: revdict.update(origin)
# Leave as it is
object_map[key] = latest_rev[key]
else:
# New data!
if isinstance(data[key], dict): # dict, just need to update values
new_sub_rev = self.objects[latest_rev[key]].update(data[key])
self.objects[new_id] = (self.objects[latest_rev[key]], new_sub_rev)
else:
new_id = random_id()
self.objects[new_id] = data[key]
object_map[key] = new_id
anything_changed = True
if anything_changed:
new_rev = random_id()
self.revisions[new_rev] = (self.latest_revision, object_map) # (parent revision, new object map)
return new_rev
else:
return latest_rev
def _get_latest_revision(): """
return self.revisions[self.latest_revision]
def _dump_latest_revision(): revdict = RevisionedDict(origin)
obj = {}
for key, id_ in self._get_latest_revision().iteritems():
obj[key] = self.objects[id_]
return obj
def update(data): for i in xrange(0, 5):
rev_id = self._add_revision(data) x = revdict._add_revision("blah")
self.latest_revision = rev_id revdict._applied_diffs.append((x, i))
return rev_id
# TODO: compare! base_rev = revdict._last_revision
# Problems: for i in xrange(5, 10):
# - How to handle list diffs? Can't just replace, would still lose data.. x = revdict._add_revision("blah")
# - Over-engineering? Python already interns primitives, so no point in storing object references rather than just direct revision maps? revdict._applied_diffs.append((x, i))
# -> Would still need to pre-process dicts and lists before storage, and compare them...
# Ideas: print revdict._diffs_since(base_rev)
# - Download PDF/EPUB headers and extract metadata from there

Loading…
Cancel
Save