def jaccard_similarity(s1: str, s2: str) -> float:
    """Compute the Jaccard similarity score between s1 and s2.

    Each string is reduced to the set of its distinct characters; the score
    is the size of the intersection of the two sets divided by the size of
    their union. Character order and repetition are therefore ignored.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        A value in [0.0, 1.0]. Two empty strings yield 1.0.
    """
    chars1, chars2 = set(s1), set(s2)
    union = chars1 | chars2
    if not union:
        # Both inputs are empty: the ratio would be 0/0. The two character
        # sets are identical, so report full similarity instead of raising
        # ZeroDivisionError.
        return 1.0
    return len(chars1 & chars2) / len(union)
if __name__ == "__main__":
    # Small demonstration: score two example sentences and print the result.
    s1 = "String matching is not easy"
    s2 = "Compare two strings is not easy"

    jaccard_sim = jaccard_similarity(s1, s2)
    print(f"The Jaccard similarity between {s1} and {s2} is {jaccard_sim}")
Jaccard相似度的优点是实现简单,速度快;但相应地,由于这种方法既不考虑字符的顺序,也不考虑字符出现的次数,可靠性不高。
最长公共子字符串百分比
最长公共子字符串是指两个字符串中所共有的最长子字符串。
它的计算方式也很容易理解,即将相似性度量为最长公共子字符串的长度与两个字符串中较短者的长度之比。
以下是Python中计算最长公共子字符串百分比的实现:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
from difflib import SequenceMatcher
def longest_common_substring(s1: str, s2: str) -> str:
    """Compute the longest common substring of s1 and s2.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        The longest contiguous run of characters present in both strings,
        or the empty string when they share no character. When several runs
        tie for the maximum length, the one found by
        ``SequenceMatcher.find_longest_match`` is returned.
    """
    # autojunk=False disables difflib's "popular element" heuristic, which
    # would otherwise ignore frequently occurring characters once s2 is at
    # least 200 characters long and report a match that is too short.
    seq_matcher = SequenceMatcher(isjunk=None, a=s1, b=s2, autojunk=False)
    match = seq_matcher.find_longest_match(0, len(s1), 0, len(s2))
    # match.a is the start offset in s1; a zero-size match slices to "".
    return s1[match.a:match.a + match.size]
def longest_common_substring_percentage(s1: str, s2: str) -> float:
    """Compute the longest common substring percentage of s1 and s2.

    The score is the length of the longest common substring divided by the
    length of the shorter of the two input strings.

    Args:
        s1: First string; must be non-empty.
        s2: Second string; must be non-empty.

    Returns:
        The ratio described above, as a float.

    Raises:
        AssertionError: If either string is empty.
    """
    shortest = min(len(s1), len(s2))
    if shortest == 0:
        # Raised explicitly instead of via an `assert` statement so the check
        # is still performed under `python -O`; the exception type and
        # message are the same as before for existing callers.
        raise AssertionError("One of the given string is empty")
    return len(longest_common_substring(s1, s2)) / shortest