dictionary - 两个Python字典之间的快速比较

假设我有两个 Python 字典 dictA 和 dictB,我需要找出在 dictB 中存在但在 dictA 中不存在的键,最快的方法是什么?

我是不是应该把字典键转换成一个集合,然后继续?


说明


dictA={'key1':a, 'key2':b, 'key3':{'key11':cc, 'key12':dd}, 'key4':{'key111':{....}}}
dictB={'key1':a, 'key2:':newb, 'key3':{'key11':cc, 'key12':newdd, 'key13':ee}.......

时间:

你可以对键值使用set操作:


diff = set(dictb.keys()) - set(dicta.keys())

下面是一个查找所有可能性的类: 添加的内容,删除的内容,哪个key-value对是相同的,哪些key-value对被更改。


class DictDiffer(object):
    """Compare two dictionaries key by key.

    Relative to ``past_dict``, the methods report:
      (1) keys added in ``current_dict``
      (2) keys removed (present only in ``past_dict``)
      (3) keys present in both whose values differ
      (4) keys present in both whose values are equal

    Values are compared with ``!=`` / ``==`` only; nested containers are not
    walked recursively.
    """

    def __init__(self, current_dict, past_dict):
        self.current_dict, self.past_dict = current_dict, past_dict
        # Iterating a dict yields its keys, so no explicit .keys() is needed.
        self.set_current, self.set_past = set(current_dict), set(past_dict)
        self.intersect = self.set_current & self.set_past

    def added(self):
        """Return the set of keys found only in ``current_dict``."""
        return self.set_current - self.intersect

    def removed(self):
        """Return the set of keys found only in ``past_dict``."""
        return self.set_past - self.intersect

    def changed(self):
        """Return the set of shared keys whose values differ."""
        return set(
            key for key in self.intersect
            if self.past_dict[key] != self.current_dict[key]
        )

    def unchanged(self):
        """Return the set of shared keys whose values compare equal."""
        return set(
            key for key in self.intersect
            if self.past_dict[key] == self.current_dict[key]
        )

下面是一些例子输出:


>>> a = {'a': 1, 'b': 1, 'c': 0}
>>> b = {'a': 1, 'b': 2, 'd': 0}
>>> d = DictDiffer(b, a)
>>> print"Added:", d.added()
Added: set(['d'])
>>> print"Removed:", d.removed()
Removed: set(['c'])
>>> print"Changed:", d.changed()
Changed: set(['b'])
>>> print"Unchanged:", d.unchanged()
Unchanged: set(['a'])

https://github.com/hughdbrown/dictdiffer

如果你只是想检查 dictB 中是否存在不在 dictA 中的键,可以使用 any(True for k in dictB if k not in dictA)。

要查找缺少的键:


diff = set(dictB)-set(dictA) #sets

C:\Dokumente und Einstellungen\thc>python -m timeit -s "dictA = dict(zip(range(1000),range(1000))); dictB = dict(zip(range(0,2000,2),range(1000)))" "diff=set(dictB)-set(dictA)"
10000 loops, best of 3: 107 usec per loop

diff = [ k for k in dictB if k not in dictA ] #lc

C:\Dokumente und Einstellungen\thc>python -m timeit -s "dictA = dict(zip(range(1000),range(1000))); dictB = dict(zip(range(0,2000,2),range(1000)))" "diff=[ k for k in dictB if k not in dictA ]"
10000 loops, best of 3: 95.9 usec per loop

所以这两个解决方案几乎是相同的速度。


dicta = {"a": 1, "b": 2, "c": 3, "d": 4}
dictb = {"a": 1, "d": 2}
# Print every key of dicta that is absent from dictb.
# Iterating a dict yields its keys; the call form of print works on both
# Python 2 and Python 3 for a single argument.
for key in dicta:
    if key not in dictb:
        print(key)

如果你的意思确实如字面所说(即只需要判断"是否存在"在 B 中而不在 A 中的键,而不需要知道具体是哪些键),那么最快的方法应该是:


if any(True for k in dictB if k not in dictA): ...

还有另一个关于这个话题的问题,我必须承认其中给出了一个简单的解决方案:Python 的 datadiff 库可以帮助打印两个字典之间的差异。

如果python ≥ 2.7 :


# Overwrite values in dictB that differ from dictA, for the keys both share.
# (One would assume only dictA should be updated, but the question specifies
# otherwise.)
# viewkeys() is the Python 2.7 set-like key view; it supports & and -.
for k in dictA.viewkeys() & dictB.viewkeys():
    if dictA[k] != dictB[k]:
        dictB[k] = dictA[k]

# Add the keys that exist only in dictB to dictA.
dictA.update((k, dictB[k]) for k in dictB.viewkeys() - dictA.viewkeys())

下面是一种可行的方法:它允许键本身的真值为 False,并且仍然使用生成器表达式以便尽早退出,不过写法不算漂亮。


any(map(lambda x: True, (k for k in b if k not in a)))

以下是执行上述操作的更好,更漂亮的方法:


any(True for k in b if k not in a)

PyDev -> new PyDev Module -> Module: unittest


import unittest


class Test(unittest.TestCase):
    """Example test case comparing two dictionaries with assertDictEqual."""

    def testName(self):
        obj1 = {1: 1, 2: 2}
        obj2 = {1: 1, 2: 2}
        # maxDiff = None removes the length limit on the diff that unittest
        # prints on failure, which is sometimes useful for large dicts.
        self.maxDiff = None
        # Compare the dictionaries defined above (the original referenced
        # the undefined names d1/d2, which raised NameError).
        self.assertDictEqual(obj1, obj2)


if __name__ == "__main__":
    # To run a single test, set sys.argv = ['', 'Test.testName'] before
    # calling unittest.main().
    unittest.main()

这个问题中的答案帮助我解决了以下问题:

  1. 记录两个字典之间的差异
  2. 将 #1 中记录的差异合并到基础字典中
  3. 合并两个字典之间的差异(将字典 #2 视为差异字典)
  4. 尝试检测条目的移动以及变化
  5. 递归执行所有这些操作

所有这些结合JSON使得配置存储支持变得非常强大。

解决方案(也在github上 ):


from collections import OrderedDict
from pprint import pprint


class izipDestinationMatching(object):
    """Marker describing how a list item was matched to an original item.

    Stores the name of the matching attribute (``attr``), the value found
    under it (``value``) and the position of the matching original item
    (``index``).
    """

    # Fixed attribute set: instances carry no per-instance __dict__.
    __slots__ = ("attr", "value", "index")

    def __init__(self, attr, value, index):
        self.attr, self.value, self.index = attr, value, index

    def __repr__(self):
        return "izip_destination_matching: found match by '%s' = '%s' @ %d" % (
            self.attr, self.value, self.index)


def izip_destination(a, b, attrs, addMarker=True):
    """Yield ``(original, new)`` pairs, one for every item of ``b``.

    For each item of ``b`` the partner taken from ``a`` is, in order of
    preference:
      * the first item of ``a`` that shares an identity attribute (the first
        name in ``attrs`` contained in the ``b`` item) with an equal value;
      * otherwise the item of ``a`` at the same index;
      * otherwise ``None`` when ``a`` is shorter than ``b``.

    When ``addMarker`` is true and a match is found at a different index,
    an ``izipDestinationMatching`` instance is stored in the ``b`` item under
    the key ``izipDestinationMatching`` (this mutates the item).

    When ``addMarker`` is false (patching) and ``a`` is longer than ``b``,
    the surplus tail of ``a`` is also yielded as ``(item, item)`` pairs, so
    the total number of pairs equals the longer of the two lengths.

    @param a: original list
    @param b: new list
    @param attrs: iterable of keys to look for in items to identify them
    @param addMarker: whether to record relocation markers in items of b
    """
    for idx, item in enumerate(b):
        match = None
        try:
            # See if the item has any of the ID attributes.
            attr = next((x for x in attrs if x in item), None)
            if attr:
                match, matchIdx = next(
                    ((orgItm, orgIdx) for orgIdx, orgItm in enumerate(a)
                     if attr in orgItm and orgItm[attr] == item[attr]),
                    (None, None))
                if match and matchIdx != idx and addMarker:
                    item[izipDestinationMatching] = izipDestinationMatching(
                        attr, item[attr], matchIdx)
        except Exception:
            # Best effort: items that cannot be searched or indexed (e.g.
            # scalars) simply fall back to positional pairing. Catching
            # Exception rather than using a bare except lets
            # KeyboardInterrupt / SystemExit propagate.
            match = None
        yield (match if match else a[idx] if len(a) > idx else None), item
    if not addMarker and len(a) > len(b):
        # Negative start index selects the items of a beyond len(b).
        for item in a[len(b) - len(a):]:
            yield item, item


def dictdiff(a, b, searchAttrs=None):
    """Return a structure representing the difference from ``a`` to ``b``.

    The returned dict is as short as possible:
      * equal items are removed
      * added / changed items are listed
      * removed items are listed with value ``None``

    List values are processed too; the resulting list has the same size as
    the one in ``b``. List items that are dicts can be searched for identity
    values (``searchAttrs``) to detect changed positions. When such an
    identity value is found it is kept in the diff so that it can be found
    again during the merge phase.

    If ``b`` carries an ``izipDestinationMatching`` marker key, that key is
    deleted from ``b`` here. Keys are visited in sorted order, so the keys of
    ``a`` and ``b`` must be mutually orderable.

    @param a: original dict
    @param b: new dict
    @param searchAttrs: list of strings (keys to search for in sub-dicts)
    @return: dict / list / whatever input is
    """
    if searchAttrs is None:
        searchAttrs = []
    if not (isinstance(a, dict) and isinstance(b, dict)):
        if isinstance(a, list) and isinstance(b, list):
            return [dictdiff(v1, v2, searchAttrs)
                    for v1, v2 in izip_destination(a, b, searchAttrs)]
        return b
    res = OrderedDict()
    if izipDestinationMatching in b:
        # The item was matched by an identity attribute: always keep that
        # attribute in the diff, then drop the marker.
        keepKey = b[izipDestinationMatching].attr
        del b[izipDestinationMatching]
    else:
        # Placeholder that is not expected to equal any real key.
        keepKey = izipDestinationMatching
    # Union of both key sets; list concatenation of .keys() only works on
    # Python 2, where keys() returns a list.
    for key in sorted(set(a) | set(b)):
        v1 = a.get(key, None)
        v2 = b.get(key, None)
        if keepKey == key or v1 != v2:
            res[key] = dictdiff(v1, v2, searchAttrs)
    if len(res) <= 1:
        # This is only here for pretty print (OrderedDict doesn't pprint nicely).
        res = dict(res)
    return res


def dictmerge(a, b, searchAttrs=None):
    """Return a structure that merges the differences in ``b`` into ``a``.

    For dict inputs:
      * a key whose value in ``b`` is not ``None`` is merged recursively
      * a key absent from ``b`` keeps its value from ``a``
      * a key present in ``b`` with value ``None`` is omitted from the result

    List values are processed too, pairing items through
    ``izip_destination`` with markers disabled. List items that are dicts
    can be searched for identity values (``searchAttrs``) to detect changed
    positions. For any other combination of types ``b`` is returned as is.

    Keys are visited in sorted order, so the keys of ``a`` and ``b`` must be
    mutually orderable.

    @param a: original dict
    @param b: diff dict to patch into a
    @param searchAttrs: list of strings (keys to search for in sub-dicts)
    @return: dict / list / whatever input is
    """
    if searchAttrs is None:
        searchAttrs = []
    if not (isinstance(a, dict) and isinstance(b, dict)):
        if isinstance(a, list) and isinstance(b, list):
            return [dictmerge(v1, v2, searchAttrs)
                    for v1, v2 in izip_destination(a, b, searchAttrs, False)]
        return b
    res = OrderedDict()
    # Union of both key sets; list concatenation of .keys() only works on
    # Python 2, where keys() returns a list.
    for key in sorted(set(a) | set(b)):
        v1 = a.get(key, None)
        v2 = b.get(key, None)
        if v2 is not None:
            res[key] = dictmerge(v1, v2, searchAttrs)
        elif key not in b:
            # Untouched by the diff: carry the original value over.
            res[key] = v1
    if len(res) <= 1:
        # This is only here for pretty print (OrderedDict doesn't pprint nicely).
        res = dict(res)
    return res

如果你希望递归地进行差异,我已经编写了python软件包: https://github.com/erasmose/deepdiff

安装

从PyPi安装:

 
pip install deepdiff

 

如果你使用的是 Python 3,则还需要安装:


pip install future six

用法举例


>>> from deepdiff import DeepDiff
>>> from pprint import pprint
>>> from __future__ import print_function

相同对象返回空


>>> t1 = {1:1, 2:2, 3:3}
>>> t2 = t1
>>> ddiff = DeepDiff(t1, t2)
>>> print (ddiff.changes)
 {}

项的类型已更改


>>> t1 = {1:1, 2:2, 3:3}
>>> t2 = {1:1, 2:"2", 3:3}
>>> ddiff = DeepDiff(t1, t2)
>>> print (ddiff.changes)
 {'type_changes': ["root[2]: 2=<type 'int'> vs. 2=<type 'str'>"]}

项的值已更改


>>> t1 = {1:1, 2:2, 3:3}
>>> t2 = {1:1, 2:4, 3:3}
>>> ddiff = DeepDiff(t1, t2)
>>> print (ddiff.changes)
 {'values_changed': ['root[2]: 2 ====>> 4']}

已添加和/或删除项


>>> t1 = {1:1, 2:2, 3:3, 4:4}
>>> t2 = {1:1, 2:4, 3:3, 5:5, 6:6}
>>> ddiff = DeepDiff(t1, t2)
>>> pprint (ddiff.changes)
 {'dic_item_added': ['root[5, 6]'],
 'dic_item_removed': ['root[4]'],
 'values_changed': ['root[2]: 2 ====>> 4']}

字符串差异


>>> t1 = {1:1, 2:2, 3:3, 4:{"a":"hello","b":"world"}}
>>> t2 = {1:1, 2:4, 3:3, 4:{"a":"hello","b":"world!"}}
>>> ddiff = DeepDiff(t1, t2)
>>> pprint (ddiff.changes, indent = 2)
 { 'values_changed': [ 'root[2]: 2 ====>> 4',
"root[4]['b']:\n--- \n+++ \n@@ -1 +1 @@\n-world\n+world!"]}
>>>
>>> print (ddiff.changes['values_changed'][1])
 root[4]['b']:
 --- 
 +++ 
 @@ -1 +1 @@
 -world
 +world!

String 差异2


>>> t1 = {1:1, 2:2, 3:3, 4:{"a":"hello","b":"world!\nGoodbye!\n1\n2\nEnd"}}
>>> t2 = {1:1, 2:2, 3:3, 4:{"a":"hello","b":"world\n1\n2\nEnd"}}
>>> ddiff = DeepDiff(t1, t2)
>>> pprint (ddiff.changes, indent = 2)
 { 'values_changed': ["root[4]['b']:\n--- \n+++ \n@@ -1,5 +1,4 @@\n-world!\n-Goodbye!\n+world\n 1\n 2\n End"]}
>>>
>>> print (ddiff.changes['values_changed'][0])
 root[4]['b']:
 --- 
 +++ 
 @@ -1,5 +1,4 @@
 -world!
 -Goodbye!
 +world
 1
 2
 End

类型更改


>>> t1 = {1:1, 2:2, 3:3, 4:{"a":"hello","b":[1, 2, 3]}}
>>> t2 = {1:1, 2:2, 3:3, 4:{"a":"hello","b":"world\n\n\nEnd"}}
>>> ddiff = DeepDiff(t1, t2)
>>> pprint (ddiff.changes, indent = 2)
 { 'type_changes': ["root[4]['b']: [1, 2, 3]=<type 'list'> vs. world\n\n\nEnd=<type 'str'>"]}

列表差异


>>> t1 = {1:1, 2:2, 3:3, 4:{"a":"hello","b":[1, 2, 3]}}
>>> t2 = {1:1, 2:2, 3:3, 4:{"a":"hello","b":[1, 2]}}
>>> ddiff = DeepDiff(t1, t2)
>>> pprint (ddiff.changes, indent = 2)
 { 'list_removed': ["root[4]['b']: [3]"]}

列表差异2:请注意,它没有考虑顺序


>>> # Note that it DOES NOT take order into account
... t1 = {1:1, 2:2, 3:3, 4:{"a":"hello","b":[1, 2, 3]}}
>>> t2 = {1:1, 2:2, 3:3, 4:{"a":"hello","b":[1, 3, 2]}}
>>> ddiff = DeepDiff(t1, t2)
>>> pprint (ddiff.changes, indent = 2)
 { }

包含字典的列表:


>>> t1 = {1:1, 2:2, 3:3, 4:{"a":"hello","b":[1, 2, {1:1, 2:2}]}}
>>> t2 = {1:1, 2:2, 3:3, 4:{"a":"hello","b":[1, 2, {1:3}]}}
>>> ddiff = DeepDiff(t1, t2)
>>> pprint (ddiff.changes, indent = 2)
 { 'dic_item_removed': ["root[4]['b'][2][2]"],
 'values_changed': ["root[4]['b'][2][1]: 1 ====>> 3"]}

...