ASP比较文章标题相似度,用于删除网站相似文章数据

' CheckSimilar - estimates how similar two article titles are.
'
' How it works:
'   1. Both titles are cleaned with ReplaceDot.
'   2. Each character of the shorter cleaned title is looked up in the longer
'      cleaned title with InStr.
'   3. The number of characters found is divided by the length of the longer
'      cleaned title.
'
' Parameters:
'   Str1, Str2 - the two titles to compare. The caller's variables are not
'                modified.
' Returns:
'   The ratio formatted by FormatNumber with two decimals (FormatNumber yields a
'   String, rendered per the server's regional settings). A ratio of 0 is
'   returned when either cleaned title is empty.
Function CheckSimilar(Str1,Str2)
	Dim cleanA, cleanB
	Dim shorter, longer
	Dim hits, pos

	' Work on local copies: VBScript passes arguments ByRef by default, so
	' assigning to Str1/Str2 (or handing them to a helper that assigns to its
	' own parameter) would overwrite the caller's variables.
	cleanA = Str1
	cleanA = ReplaceDot(cleanA)
	cleanB = Str2
	cleanB = ReplaceDot(cleanB)

	hits = 0

	If cleanA <> "" And cleanB <> "" Then

		' Always walk the shorter title and search in the longer one, so the
		' score does not depend on the order of the arguments.
		If Len(cleanA) <= Len(cleanB) Then
			shorter = cleanA
			longer  = cleanB
		Else
			shorter = cleanB
			longer  = cleanA
		End If

		For pos = 1 To Len(shorter)

			If InStr(longer, Mid(shorter, pos, 1)) > 0 Then
				hits = hits + 1
			End If

		Next

		CheckSimilar = FormatNumber(hits / Len(longer), 2) ' two decimals
	Else
		' Keep the return type consistent with the non-empty case.
		CheckSimilar = FormatNumber(0, 2)
	End If

End Function

' ReplaceDot - strips obfuscating characters (such as # @ % & *) from a title.
' Every character that is not an ASCII letter, an ASCII digit, or inside the
' range U+4E00-U+9FA5 is removed. Extend the pattern if more characters should
' be kept.
'
' Parameters:
'   title - the text to clean. Null or Empty is treated as an empty string.
'           The caller's variable is left untouched.
' Returns:
'   The cleaned string.
Function ReplaceDot(title)
	Dim re_dot
	Set re_dot = New RegExp
	re_dot.Pattern = "[^a-zA-Z0-9\u4e00-\u9fa5]"
	re_dot.IgnoreCase = True
	re_dot.Global = True
	' Arguments are ByRef by default in VBScript, so the result is assigned to
	' the function name only and never written back to title. Concatenating ""
	' turns Null/Empty into "" before the value reaches RegExp.Replace.
	ReplaceDot = re_dot.Replace(title & "", "")
	Set re_dot = Nothing
End Function

' Demo: check the result for two sample titles. A score above 0.3 is treated as
' duplicate content, and a notice is written to the response.
If CheckSimilar("小庄天下第一商城附近的英语培训班","东大桥蓝岛大厦附近的英语培训班") > 0.3 Then Response.Write "你已经发布过同类信息,不要重复发布!"