之前拿到一堆源码文件,在整理的时候需要计算一些文件之间的相似率。与之相关,第一个想到的就是Linux上的“diff”命令。把diff命令的输出拿过来简单计算一下,就有结果了。于是用Python调用一下命令行,短短几行代码,就OK了。
# !/usr/bin/python
# -*- coding: utf-8 -*-
"""
create_author : 蛙鳜鸡鹳狸猿
create_time : 2017-06-01
program : *_* evaluate the similarity between two files *_*
"""
import commands
def simieval(lf, rf):
    """
    Evaluate the similarity between two files.

    Relies on the "diff" command being available on the platform. The files
    are compared with "diff -bB", i.e. changes in the amount of whitespace
    and changes that only add or remove blank lines are ignored.

    The ratio is ``1 - changed / total`` where

    * ``changed`` is the number of diff output lines starting with "<" or
      ">" minus the number of hunk header lines (lines starting with a
      digit, such as "2c2");
    * ``total`` is the number of lines, over both files together, that
      contain at least one character other than space, tab or newline.

    :param lf: string
        File to diff at left.
    :param rf: string
        File to diff at right.
    :return: float
        Similarity ratio between two files. 1.0 when neither file contains
        a non-blank line.
    :raises IOError / OSError:
        If a file cannot be read or "diff" cannot be started.
    :raises RuntimeError:
        If "diff" exits with a status other than 0 (same) or 1 (different).
    """
    # Local import: keeps this function independent of the Python-2-only
    # "commands" module and of shell string interpolation.
    import subprocess

    # Denominator: non-blank lines of both files. Read in binary mode so the
    # count does not depend on the text encoding of the sources.
    total = 0
    for path in (lf, rf):
        with open(path, "rb") as handle:
            total += sum(1 for line in handle if line.strip(b" \t\n"))

    # Nothing comparable on either side: treat the files as identical
    # instead of dividing by zero.
    if total == 0:
        return 1.0

    # Argument list (no shell), so file names containing spaces or shell
    # metacharacters are passed through verbatim; "--" ends option parsing.
    proc = subprocess.Popen(
        ["diff", "-bB", "--", lf, rf],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    out, err = proc.communicate()
    if proc.returncode not in (0, 1):
        raise RuntimeError(
            "diff exited with status %d: %s"
            % (proc.returncode, err.decode("utf-8", "replace").strip())
        )

    changed = 0
    for line in out.splitlines():
        lead = line[:1]  # slice, not index: stays a byte string on Python 3
        if lead in (b"<", b">"):
            changed += 1
        elif lead.isdigit():
            # Hunk header: each hunk discounts one changed line.
            changed -= 1

    return 1 - float(changed) / total
if __name__ == "__main__":
    # Demo: write two small sample files under a fixed directory and print
    # their similarity ratio as computed by simieval().
    writer = "/home/student/"

    ll = """
Ubuntu
MySQL
C
JAVA
Shell
"""
    # "with" guarantees the handle is closed even if the write fails;
    # write() is the proper call for a single string.
    with open(writer + "lf", 'w') as wl:
        wl.write(ll)

    lr = """
CentOS
MySQL
C
Python
"""
    with open(writer + "rf", 'w') as wr:
        wr.write(lr)

    se = simieval(lf=writer + "lf", rf=writer + "rf")
    print(se)
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58