'ASP比较文章标题相似度,用于删除网站相似文章数据
' Compare two article titles and return how similar they are.
'
' How it works:
'   1. Both titles are cleaned with ReplaceDot.
'   2. Every character of the SHORTER cleaned title is looked up in the
'      LONGER cleaned title.
'   3. Result = (characters found) / (length of the longer title).
'
' Parameters:
'   Str1, Str2 - the two titles to compare. They are not modified.
'
' Returns:
'   The ratio formatted by FormatNumber with two decimals (a String,
'   e.g. "0.50"). If either title is empty after cleaning, the ratio is 0.
Function CheckSimilar(Str1,Str2)
    Dim cleanedFirst, cleanedSecond
    Dim shorterTitle, longerTitle
    Dim matchCount, charPos
    Dim ratio

    ' Pass an expression instead of the bare variable so the caller's
    ' (implicitly ByRef) arguments are never overwritten; the & "" also
    ' turns Null/Empty into an empty string.
    cleanedFirst = ReplaceDot(Str1 & "")
    cleanedSecond = ReplaceDot(Str2 & "")

    ratio = 0
    If cleanedFirst <> "" And cleanedSecond <> "" Then
        ' Decide which title supplies the characters (shorter) and which
        ' one is searched (longer).
        If Len(cleanedFirst) >= Len(cleanedSecond) Then
            longerTitle = cleanedFirst
            shorterTitle = cleanedSecond
        Else
            longerTitle = cleanedSecond
            shorterTitle = cleanedFirst
        End If

        matchCount = 0
        For charPos = 1 To Len(shorterTitle)
            If InStr(longerTitle, Mid(shorterTitle, charPos, 1)) > 0 Then
                matchCount = matchCount + 1
            End If
        Next

        ratio = matchCount / Len(longerTitle)
    End If

    ' Format to two decimal places (result is a String).
    CheckSimilar = FormatNumber(ratio, 2)
End Function
' Remove obfuscation characters from a title (for example # @ % & * and
' other punctuation). Extend the pattern if further cases need handling.
'
' Parameters:
'   title - the text to clean. It is not modified; Null or Empty is
'           treated as an empty string.
'
' Returns:
'   The text with every character removed that the pattern does not
'   allow: only a-z, A-Z, 0-9 and the range \u4e00-\u9fa5 are kept.
Function ReplaceDot(title)
    Dim re_dot
    Set re_dot = New RegExp
    re_dot.Pattern = "[^a-zA-Z0-9\u4e00-\u9fa5]"
    re_dot.IgnoreCase = True
    re_dot.Global = True
    ' Assign straight to the return value instead of writing back to
    ' "title", which is passed ByRef by default and would change the
    ' caller's variable. The & "" coerces Null/Empty to "".
    ReplaceDot = re_dot.Replace(title & "", "")
    Set re_dot = Nothing
End Function
' Try it out: a similarity score above 0.3 is treated as duplicate content.
If CheckSimilar("小庄天下第一商城附近的英语培训班","东大桥蓝岛大厦附近的英语培训班") > 0.3 Then Call Response.Write("你已经发布过同类信息,不要重复发布!")