"""Revision-tracking dictionary with structural diffs.

``RevisionedDict`` stores successive deep-copied states of a dictionary and
``RevisionedDictDiff`` describes (and can re-apply) the difference between a
modified copy and the revision that copy was taken from.
"""

# Problems:
# - How to handle list diffs? Can't just replace, would still lose data..
# - Over-engineering? Python already interns primitives, so no point in
#   storing object references rather than just direct revision maps?
#   -> Would still need to pre-process dicts and lists before storage, and
#      compare them...

# Ideas:
# - Download PDF/EPUB headers and extract metadata from there

import copy
import random
import string
from collections import OrderedDict
from difflib import SequenceMatcher


class RevisionedDict(object):
    """Keep every state of a dictionary as an immutable, identified revision.

    Revisions are stored as deep copies keyed by a random identifier, in
    insertion order. Copies handed out by ``_get_last_revision`` carry a
    ``"_rev"`` key naming the revision they came from, so that a modified
    copy can later be diffed against the right origin.
    """

    # ``string.ascii_letters`` is locale-independent and exists on both
    # Python 2 and 3 (unlike ``string.lowercase`` / ``string.uppercase``).
    _ID_ALPHABET = string.ascii_letters + string.digits
    _ID_LENGTH = 14

    def __init__(self, data):
        """Create the store with ``data`` as its first revision."""
        self._revisions = OrderedDict()
        # List of ``(revision_id, diff)`` pairs, in the order they were applied.
        self._applied_diffs = []
        self._add_revision(data)

    def _generate_revision_id(self):
        """Return a random 14-character alphanumeric identifier."""
        return "".join(
            random.choice(self._ID_ALPHABET) for _ in range(self._ID_LENGTH)
        )

    def _add_revision(self, data):
        """Store a deep copy of ``data`` as the newest revision.

        Returns:
            The identifier of the newly created revision.
        """
        new_revision_id = self._generate_revision_id()
        # Astronomically unlikely, but a collision would silently overwrite
        # an existing revision, so guard against it.
        while new_revision_id in self._revisions:
            new_revision_id = self._generate_revision_id()
        self._revisions[new_revision_id] = copy.deepcopy(data)
        self._last_revision = new_revision_id
        return new_revision_id

    def _get_last_revision(self):
        """Return a deep copy of the newest revision, tagged with ``"_rev"``.

        Always returns a copy! The ``"_rev"`` key makes it possible to
        identify the source revision of a modified serialized object later.
        """
        base_revision = copy.deepcopy(self._revisions[self._last_revision])
        base_revision["_rev"] = self._last_revision
        return base_revision

    def _apply_diff(self, diff):
        """Apply ``diff`` on top of the newest revision and store the result.

        Returns:
            The identifier of the revision created by applying the diff.
        """
        # Apply against the stored data rather than ``_get_last_revision()``
        # so the (soon to be stale) "_rev" marker is not persisted.
        new_data = diff.apply(
            self._revisions[self._last_revision],
            self._diffs_since(diff.origin_revision),
        )
        new_revision_id = self._add_revision(new_data)
        self._applied_diffs.append((new_revision_id, diff))
        return new_revision_id

    def _diffs_since(self, revision_id):
        """Return the diffs applied after ``revision_id`` was created.

        If no applied diff produced ``revision_id`` (for instance because it
        is the initial revision), every applied diff is returned.
        """
        for index, (applied_revision, _) in enumerate(self._applied_diffs):
            if applied_revision == revision_id:
                return [diff for _, diff in self._applied_diffs[index + 1:]]
        return [diff for _, diff in self._applied_diffs]

    def update(self, data):
        """Record ``data`` (a modified copy carrying ``"_rev"``) as a new revision.

        Returns:
            The identifier of the new revision.

        Raises:
            KeyError: if ``data`` has no ``"_rev"`` key or names an unknown
                revision.
        """
        diff = self.autodiff(data)
        return self._apply_diff(diff)

    def diff(self, data, origin_revision):
        """Build the diff that turns revision ``origin_revision`` into ``data``.

        Raises:
            KeyError: if ``origin_revision`` is not a known revision.
        """
        return RevisionedDictDiff(
            data, self._revisions[origin_revision], origin_revision
        )

    def autodiff(self, data):
        """Like ``diff``, taking the origin revision from ``data["_rev"]``."""
        return self.diff(data, data["_rev"])


class RevisionedDictDiff(object):
    """Structural difference between a data structure and its origin.

    ``_diff_data`` is a tree of tuples:

    * ``("dict", key, [ops])`` where each op is ``("equal", key)``,
      ``("delete", key)``, ``("insert", key, value)``,
      ``("replace", key, value)`` or a nested ``"dict"`` / ``"list"`` node;
    * ``("list", key, [ops])`` where each op is a ``SequenceMatcher`` opcode
      ``(tag, i1, i2, j1, j2)`` extended with a sixth element holding the
      new items for that span (empty for ``"equal"`` / ``"delete"``);
    * ``("replace", key, value)`` when the value is replaced wholesale.

    ``key`` is ``None`` for the root node.
    """

    def __init__(self, data, origin_data, origin_revision):
        """Compute the diff from ``origin_data`` to ``data``."""
        self.origin_revision = origin_revision
        self._diff_data = self._diff_structure(data, origin_data)

    def _diff_structure(self, structure, origin_structure, structure_key=None):
        """Return the diff node turning ``origin_structure`` into ``structure``."""
        if isinstance(structure, dict) and isinstance(origin_structure, dict):
            return (
                "dict",
                structure_key,
                self._diff_dict(structure, origin_structure),
            )
        if isinstance(structure, list) and isinstance(origin_structure, list):
            opcodes = self._diff_list(structure, origin_structure)
            if opcodes is not None:
                return ("list", structure_key, opcodes)
        # Type changed, primitive value, or list that cannot be sequence-diffed.
        # The key is always included so nested replacements keep their target.
        return ("replace", structure_key, structure)

    def _diff_dict(self, structure, origin_structure):
        """Return the per-key opcodes turning one dict into another."""
        opcodes = [
            ("delete", key) for key in origin_structure if key not in structure
        ]
        for key, value in structure.items():
            if key not in origin_structure:
                if key != "_rev":  # Ignore added _rev key
                    opcodes.append(("insert", key, value))
            elif value == origin_structure[key]:
                opcodes.append(("equal", key))
            elif isinstance(value, (dict, list)):
                opcodes.append(
                    self._diff_structure(
                        value, origin_structure[key], structure_key=key
                    )
                )
            else:
                opcodes.append(("replace", key, value))
        return opcodes

    def _diff_list(self, structure, origin_structure):
        """Return extended ``SequenceMatcher`` opcodes, or ``None``.

        ``None`` means the lists cannot be sequence-diffed because they hold
        unhashable items (e.g. nested dictionaries); the caller then falls
        back to replacing the list wholesale.
        """
        try:
            raw_opcodes = SequenceMatcher(
                a=origin_structure, b=structure, autojunk=False
            ).get_opcodes()
        except TypeError:
            return None
        # Carry the new items along: the indices alone are not enough to
        # re-apply an insert or replace later.
        return [
            (
                tag, i1, i2, j1, j2,
                structure[j1:j2] if tag in ("replace", "insert") else [],
            )
            for tag, i1, i2, j1, j2 in raw_opcodes
        ]

    def _apply_structure(self, structure, diff_data, intermediate_diffs):
        """Apply the diff node ``diff_data`` to ``structure`` and return it.

        ``structure`` is modified in place where possible; the return value
        must be used because a ``"replace"`` node yields a new object.

        Raises:
            TypeError: if a ``"dict"`` / ``"list"`` node targets a value of
                another type.
            ValueError: on an unknown opcode.
        """
        kind = diff_data[0]

        if kind == "replace":
            return copy.deepcopy(diff_data[2])

        if kind == "dict":
            if not isinstance(structure, dict):
                raise TypeError(
                    "cannot apply dict diff to %s" % type(structure).__name__
                )
            for opcode in diff_data[2]:
                tag, key = opcode[0], opcode[1]
                if tag == "equal":
                    continue
                elif tag == "delete":
                    structure.pop(key, None)
                elif tag in ("insert", "replace"):
                    structure[key] = copy.deepcopy(opcode[2])
                elif tag in ("dict", "list"):
                    structure[key] = self._apply_structure(
                        structure.get(key), opcode, intermediate_diffs
                    )
                else:
                    raise ValueError("unknown dict opcode %r" % (tag,))
            return structure

        if kind == "list":
            if not isinstance(structure, list):
                raise TypeError(
                    "cannot apply list diff to %s" % type(structure).__name__
                )
            # Opcodes are ordered by position in the origin list; applying
            # them back to front keeps the earlier indices valid.
            for tag, i1, i2, _j1, _j2, items in reversed(diff_data[2]):
                if tag != "equal":
                    structure[i1:i2] = copy.deepcopy(items)
            return structure

        raise ValueError("unknown diff node %r" % (kind,))

    def apply(self, data, intermediate_diffs=None):
        """Return a deep copy of ``data`` with this diff applied.

        Args:
            data: The source structure; it is not modified.
            intermediate_diffs: Diffs applied since ``origin_revision``.
                TODO: these are not yet used to rebase list offsets, so list
                changes are only reliable when ``data`` still matches the
                origin revision's lists.
        """
        if intermediate_diffs is None:
            intermediate_diffs = []
        return self._apply_structure(
            copy.deepcopy(data), self._diff_data, intermediate_diffs
        )


origin = {
    "type": "message",
    "data": {
        "title": "Sample title",
        "author": "Sample author",
        "isbn": ["a0", "a1", "a2", "a3"],
        "description": ["test one", "test two"],
        "eq": ["a", "b", "c"]
    }
}

"""
revdict = RevisionedDict(origin)
origin = revdict._get_last_revision()

origin["herp"] = "derp"
origin["data"]["isbn"].remove("a2")
origin["data"]["isbn"].insert(0, "a4")
origin["data"]["author"] = "Other author"

#import json
#print json.dumps(revdict.autodiff(origin)._diff_data, indent=4)

revdict.update(origin)

"""

revdict = RevisionedDict(origin)

for i in range(0, 5):
    x = revdict._add_revision("blah")
    revdict._applied_diffs.append((x, i))

base_rev = revdict._last_revision

for i in range(5, 10):
    x = revdict._add_revision("blah")
    revdict._applied_diffs.append((x, i))

print(revdict._diffs_since(base_rev))