def minDistance(word1: str, word2: str) -> int:
    """Compute the Levenshtein edit distance between two strings.

    Returns the minimum number of single-character insertions,
    deletions and substitutions needed to turn word1 into word2.
    """
    n = len(word1)
    m = len(word2)
    # If either string is empty, the distance is the other's length.
    if n * m == 0:
        return n + m
    # DP table: D[i][j] = edit distance between word1[:i] and word2[:j].
    D = [[0] * (m + 1) for _ in range(n + 1)]
    # Boundary states: transforming to/from an empty prefix.
    for i in range(n + 1):
        D[i][0] = i
    for j in range(m + 1):
        D[0][j] = j
    # Fill the table.
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            left = D[i - 1][j] + 1        # delete a char from word1
            down = D[i][j - 1] + 1        # insert a char into word1
            left_down = D[i - 1][j - 1]   # substitute (free on match)
            if word1[i - 1] != word2[j - 1]:
                left_down += 1
            D[i][j] = min(left, down, left_down)
    return D[n][m]
# For each name to look up, pick the candidate in data.user with the
# smallest edit distance.
result = []
for name in find.name.values:
    distances = data.user.apply(lambda user: minDistance(user, name))
    # idxmin() returns the index *label* of the minimum; the original
    # argmin() gives the positional index, which only coincides with the
    # label on a default RangeIndex.
    result.append(data.user[distances.idxmin()])
find["result"] = result
find
测试后发现部分地址的匹配效果不佳。
我们任取2个查询地址(先看"信阳息县淮河路店"),列出编辑距离最小的前10个候选地址及其编辑距离:
# Edit distance from every candidate to the target address,
# then keep the ten closest matches.
target = '河南美锐信阳息县淮河路分店'
a = data.user.apply(lambda user: minDistance(user, target)).nsmallest(10).reset_index()
a.columns = ["名称", "编辑距离"]
# reset_index left positional indices in 名称; map them back to the names.
a.名称 = data.user[a.名称].values
a
# Same ranking for the second sample address.
target = '河南美锐信阳潢川四中分店'
a = data.user.apply(lambda user: minDistance(user, target)).nsmallest(10).reset_index()
a.columns = ["名称", "编辑距离"]
# Replace positional indices with the actual candidate names.
a.名称 = data.user[a.名称].values
a
# fuzzywuzzy similarity ratio against the target; higher is better,
# so take the ten largest scores.
scores = data.user.apply(lambda user: fuzz.ratio(user, '河南美锐信阳潢川四中分店'))
a = scores.nlargest(10).reset_index()
a.columns = ["名称", "相似度"]
# Swap the positional indices for the candidate names themselves.
a.名称 = data.user[a.名称].values
a
非完全匹配(Partial Ratio):
# partial_ratio scores the best-matching substring, which tolerates
# extra prefix/suffix text in the candidate names.
scores = data.user.apply(lambda user: fuzz.partial_ratio(user, '河南美锐信阳潢川四中分店'))
a = scores.nlargest(10).reset_index()
a.columns = ["名称", "相似度"]
# Swap the positional indices for the candidate names themselves.
a.名称 = data.user[a.名称].values
a
显然fuzzywuzzy库的 ratio()函数比前面自己写的编辑距离算法,准确度高了很多。
process模块
process模块则是进一步的封装,可以直接获取相似度最高的值和相似度:
from fuzzywuzzy import process
extract提取多条数据:
# process.extract returns (name, score) pairs directly, already sorted
# by similarity — no manual ranking needed.
users = data.user.to_list()
matches = process.extract('河南美锐信阳潢川四中分店', users, limit=10)
a = pd.DataFrame(matches, columns=["名称", "相似度"])
a
# For every lookup name keep only the top-3 matched strings (scores
# discarded) and expand them into one column per rank.
result = find.name.apply(
    lambda x: tuple(m for m, _ in process.extract(x, users, limit=3))
).apply(pd.Series)
result.rename(columns=lambda i: f"匹配{i+1}", inplace=True)
result = pd.concat([find.drop(columns="result"), result], axis=1)
result
虽然可能有个别正确结果这3个都不是,但整体来说为人工筛查节省了大量时间。
整体代码
from fuzzywuzzy import process
import pandas as pd

# Sheet 0 holds all customers, sheet 1 the names to look up.
excel = pd.ExcelFile("所有客户.xlsx")
data = excel.parse(0)
find = excel.parse(1)
users = data.user.to_list()
# Top-3 fuzzy matches per lookup name, expanded into one column per rank.
result = find.name.apply(
    lambda x: tuple(m for m, _ in process.extract(x, users, limit=3))
).apply(pd.Series)
result.rename(columns=lambda i: f"匹配{i+1}", inplace=True)
result = pd.concat([find, result], axis=1)
result
# Rank every candidate by TF-IDF cosine similarity and keep the top 3
# for each lookup name.  `index` is the gensim similarity index built
# earlier; indexing it with a bag-of-words vector yields a similarity
# score per candidate document.
result = []
for corpus in find_corpus.values:
    sim = pd.Series(index[corpus])
    result.append(data.user[sim.nlargest(3).index].values)
result = pd.DataFrame(result)
result.rename(columns=lambda i: f"匹配{i+1}", inplace=True)
# Drop the earlier edit-distance "result" column before attaching the
# new match columns.
result = pd.concat([find.drop(columns="result"), result], axis=1)
result.head(30)
完整代码
from gensim import corpora, similarities, models
import jieba
import pandas as pd

data = pd.read_csv("所有客户.csv", encoding="gbk")
find = pd.read_csv("被查找的客户.csv", encoding="gbk")

# Tokenize every candidate name and build a bag-of-words dictionary.
data_split_word = data.user.apply(jieba.lcut)
dictionary = corpora.Dictionary(data_split_word.values)
data_corpus = data_split_word.apply(dictionary.doc2bow)

# Map Arabic digits to Chinese numerals so e.g. "4中" can match "四中".
# NOTE(review): only the lookup names are translated, not the candidate
# side — confirm the candidate data already uses Chinese numerals.
trantab = str.maketrans("0123456789", "零一二三四五六七八九")
find_corpus = find.name.apply(
    lambda x: dictionary.doc2bow(jieba.lcut(x.translate(trantab))))

# TF-IDF weighting plus a sparse cosine-similarity index over all
# candidate documents.
tfidf = models.TfidfModel(data_corpus.to_list())
index = similarities.SparseMatrixSimilarity(
    tfidf[data_corpus], num_features=len(dictionary))

# For each lookup name keep the three most similar candidate names.
result = []
for corpus in find_corpus.values:
    sim = pd.Series(index[corpus])
    result.append(data.user[sim.nlargest(3).index].values)
result = pd.DataFrame(result)
result.rename(columns=lambda i: f"匹配{i+1}", inplace=True)
result = pd.concat([find, result], axis=1)
result.head(30)