Zwei Textdateien vergleichen
Verfasst: 12.04.2011 12:33
				
				Auf vielfachen Wunsch eines einzelnen, habe ich mich mal rangesetzt
und den Diff-Algo aus meinem History-Viewer extrahiert.
Es sind da doch einige Veränderungen nötig gewesen, um das für
die Allgemeinheit tauglich zu machen.
Ich lege also nicht die Hand dafür ins Feuer, dass da nicht noch Fehler drin sind.
Der Diff-Algo basiert auf einem C#-Code von Matthias Hertel.
Das ganze ist (mal wieder) als Interface aufgebaut, damit es möglichst wenig mit einem Hauptprogramm kollidiert.
Es wird mindestens PB4.5 benötigt.
GetTextDiff.pbi
Beispiel:
			und den Diff-Algo aus meinem History-Viewer extrahiert.
Es sind da doch einige Veränderungen nötig gewesen, um das für
die Allgemeinheit tauglich zu machen.
Ich lege also nicht die Hand dafür ins Feuer, dass da nicht noch Fehler drin sind.
Der Diff-Algo basiert auf einem C#-Code von Matthias Hertel.
Das ganze ist (mal wieder) als Interface aufgebaut, damit es möglichst wenig mit einem Hauptprogramm kollidiert.
Es wird mindestens PB4.5 benötigt.
GetTextDiff.pbi
Code: Alles auswählen
;/--------------------------
;|
;| GetTextDiff.pbi V1.00
;|                 [12.04.2011]
;|
;| ©HeX0R 2011
;|
;| Include to compare two
;| Textfiles and show
;| the differences
;|
;| TextDiff-Algorithm is ported from
;| this C#-Code from Matthias Hertel:
;| http://www.mathertel.de/Diff/Default.aspx
;|
;/--------------------------
CompilerIf #PB_Compiler_Version < 450
	CompilerError "This include needs at least PureBasic 4.5!"
CompilerEndIf
#DIFF_INSERT_BLOCK = $01
#DIFF_DELETE_BLOCK = $02
Interface InterfaceGetTextDiff
	GetTextDiff(OriginalFile.s, FileToCheck.s)
	CountDiffBlocks()
	DiffBlockType(DiffBlockNum)
	DiffBlockSize(DiffBlockNum, BlockType)
	DiffBlockLine.s(DiffBlockNum, BlockType, LineNum, *RealLineNum.INTEGER = 0)
	GetOriginalFileLine.s(Num)
	GetCheckFileLine.s(Num)
EndInterface
Structure _DIFF_
	S.s
	I.l
	modified.b
EndStructure
Structure _aITEM_
	StartA.l
	StartB.l
	DeletedA.l
	InsertedB.l
EndStructure
Structure _GETTEXTDIFF_STRUC_
	VTable.i
	LinesOrig.i
	LinesCheck.i
	DiffBlocks.i
	Array LineOrig._DIFF_(5000)  ;will increase dynamically if needed
	Array LineCheck._DIFF_(5000) ;will increase dynamically if needed
	Array MyRes._aITEM_(500)     ;will increase dynamically if needed
EndStructure
Procedure GTD_CountDiffBlocks(*THIS._GETTEXTDIFF_STRUC_)
	;/----------------
	;| How many Diffblocks are there?
	;/----------------
	ProcedureReturn *THIS\DiffBlocks
EndProcedure
	
Procedure GTD_DiffBlockType(*THIS._GETTEXTDIFF_STRUC_, DiffBlockNum)
	;/-------------
	;| Which Type is this Block
	;|
	;| You get a combination of #DIFF_DELETE_BLOCK and/or #DIFF_INSERT_BLOCK
	;/-------------
	Protected Result
	
	If DiffBlockNum >= 0 And DiffBlockNum < *THIS\DiffBlocks
		If *THIS\MyRes(DiffBlockNum)\DeletedA
			Result | #DIFF_DELETE_BLOCK
		EndIf
		If *THIS\MyRes(DiffBlockNum)\InsertedB
			Result | #DIFF_INSERT_BLOCK
		EndIf
	EndIf
	
	ProcedureReturn Result
EndProcedure
Procedure GTD_DiffBlockSize(*THIS._GETTEXTDIFF_STRUC_, DiffBlockNum, BlockType)
	;/-------------
	;| Get the size of this DiffBlock
	;|
	;| One DiffBlock can contain both, a deleteblock and an insertblock
	;| So you have to specify with BlockType, which block you need information from
	;/-------------
	Protected Result
	
	If DiffBlockNum >= 0 And DiffBlockNum < *THIS\DiffBlocks
		If BlockType = #DIFF_DELETE_BLOCK
			Result = *THIS\MyRes(DiffBlockNum)\DeletedA
		ElseIf BlockType = #DIFF_INSERT_BLOCK
			Result = *THIS\MyRes(DiffBlockNum)\InsertedB
		EndIf
	EndIf
	
	ProcedureReturn Result
EndProcedure
Procedure.s GTD_DiffBlockLine(*THIS._GETTEXTDIFF_STRUC_, DiffBlockNum, BlockType, LineNum, *RealLineNum.INTEGER = 0)
	;/---------------
	;| Get a line of a DifferenceBlock
	;|
	;| You have to specify, which kind of block you are interested in with BlockType
	;| LineNum is the Number within this Block, starting with 0
	;| You can optional receive the real LineNumber in *RealLineNum
	;/---------------
	Protected Result.s
	
	If DiffBlockNum >= 0 And DiffBlockNum < *THIS\DiffBlocks
		If BlockType = #DIFF_DELETE_BLOCK
			If LineNum >= 0 And LineNum < *THIS\MyRes(DiffBlockNum)\DeletedA
				Result = *THIS\LineOrig(*THIS\MyRes(DiffBlockNum)\StartA + LineNum)\S
				If *RealLineNum
					*RealLineNum\i = *THIS\MyRes(DiffBlockNum)\StartA + LineNum + 1
				EndIf
			EndIf
		ElseIf BlockType = #DIFF_INSERT_BLOCK
			If LineNum >= 0 And LineNum < *THIS\MyRes(DiffBlockNum)\InsertedB
				Result = *THIS\LineCheck(*THIS\MyRes(DiffBlockNum)\StartB + LineNum)\S
				If *RealLineNum
					*RealLineNum\i = *THIS\MyRes(DiffBlockNum)\StartB + LineNum + 1
				EndIf
			EndIf
		EndIf
	EndIf
	
	ProcedureReturn Result
EndProcedure
Procedure.s GTD_GetOriginalFileLine(*THIS._GETTEXTDIFF_STRUC_, Num)
	;/-----------
	;| Get the Line (Num) of the OriginalFile (starts with 0)
	;/-----------
	Protected Result.s
	
	If Num >= 0 And Num < *THIS\LinesOrig
		Result = *THIS\LineOrig(Num)\S
	EndIf
	
	ProcedureReturn Result
EndProcedure
Procedure.s GTD_GetCheckFileLine(*THIS._GETTEXTDIFF_STRUC_, Num)
	;/-----------
	;| Get the Line (Num) of the File to check (starts with 0)
	;/-----------
	Protected Result.s
	
	If Num >= 0 And Num < *THIS\LinesCheck
		Result = *THIS\LineCheck(Num)\S
	EndIf
	
	ProcedureReturn Result
EndProcedure
	
Procedure TextSearch_SMS(*THIS._GETTEXTDIFF_STRUC_, LowerA, UpperA, LowerB, UpperB, SizeA, SizeB, *ret.POINT, Array DownVector.i(1), Array UpVector.i(1))
	;Internal Procedure for Diff-Algorithm
	Protected MAX, UpOffset, DownOffset, DownK, UpK, Delta, oddDelta
	Protected MaxD, D, k, x, y
	MAX      = SizeA + SizeB + 1
	DownK    = LowerA - LowerB ;the k-line to start the forward search
	UpK      = UpperA - UpperB ;the k-line to start the reverse search
	Delta    = (UpperA - LowerA) - (UpperB - LowerB)
	If Delta & 1
		oddDelta = #True
	EndIf
	;// The vectors in the publication accepts negative indexes. the vectors implemented here are 0-based
	;// And are access using a specific offset: UpOffset UpVector And DownOffset For DownVektor
	DownOffset = MAX - DownK
	UpOffset   = MAX - UpK
	MaxD       = ((UpperA - LowerA + UpperB - LowerB) / 2) + 1
	;Debug.Write(2, "SMS", String.Format("Search the box: A[{0}-{1}] to B[{2}-{3}]", LowerA, UpperA, LowerB, UpperB));
	;init vectors
	DownVector(DownOffset + DownK + 1) = LowerA
	UpVector(UpOffset + UpK - 1)       = UpperA
	For D = 0 To MaxD
		;Extend the forward path.
		For k = DownK - D To DownK + D Step 2
			;find the only Or better starting point
			If k = DownK - D
				x = DownVector(DownOffset + k + 1)
			Else
				x = DownVector(DownOffset + k - 1) + 1
				If k < DownK + D And DownVector(DownOffset + k + 1) >= x
					x = DownVector(DownOffset + k + 1)
				EndIf
			EndIf
			y = x - k
			;find the End of the furthest reaching forward D-path in diagonal k.
			While x < UpperA And y < UpperB And *THIS\LineOrig(x)\I = *THIS\LineCheck(y)\I
				x + 1
				y + 1
			Wend
			DownVector(DownOffset + k) = x
			;overlap ?
			If oddDelta And UpK - D < k And k < UpK + D
				If UpVector(UpOffset + k) <= DownVector(DownOffset + k)
					*ret\x = DownVector(DownOffset + k)
					*ret\y = DownVector(DownOffset + k) - k
					Break 2
				EndIf
			EndIf
		Next k
		;Extend the reverse path.
		For k = UpK - D To UpK + D Step 2
			;find the only Or better starting point
			If k = UpK + D
				x = UpVector(UpOffset + k - 1) ;up
			Else
				x = UpVector(UpOffset + k + 1) - 1 ;left
				If k > UpK - D And UpVector(UpOffset + k - 1) < x
					x = UpVector(UpOffset + k - 1) ;up
				EndIf
			EndIf
			y = x - k;
			While x > LowerA And y > LowerB And *THIS\LineOrig(x - 1)\I = *THIS\LineCheck(y - 1)\I
				x - 1
				y - 1 ;diagonal
			Wend
			UpVector(UpOffset + k) = x
			;overlap ?
			If oddDelta = 0 And DownK - D <= k And k <= DownK + D
				If UpVector(UpOffset + k) <= DownVector(DownOffset + k)
					*ret\x = DownVector(DownOffset + k)
					*ret\y = DownVector(DownOffset + k) - k
					Break 2
				EndIf
			EndIf
		Next k
	Next D
EndProcedure
Procedure TextSearch_LCS(*THIS._GETTEXTDIFF_STRUC_, LowerA, UpperA, LowerB, UpperB, SizeA, SizeB, Array A1.i(1), Array B1.i(1))
	;Internal Procedure for Diff-Algorithm
	Protected Ret.POINT
	;Fast walkthrough equal lines at the start
	While LowerA < UpperA And LowerB < UpperB And *THIS\LineOrig(LowerA)\I = *THIS\LineCheck(LowerB)\I
		LowerA + 1
		LowerB + 1
	Wend
	;Fast walkthrough equal lines at the End
	While LowerA < UpperA And LowerB < UpperB And *THIS\LineOrig(UpperA - 1)\I = *THIS\LineCheck(UpperB - 1)\I
		UpperA - 1
		UpperB - 1
	Wend
	If LowerA = UpperA
		;mark As inserted lines.
		While LowerB < UpperB
			*THIS\LineCheck(LowerB)\modified = #True
			LowerB + 1
		Wend
	ElseIf LowerB = UpperB
		;mark As deleted lines.
		While LowerA < UpperA
			*THIS\LineOrig(LowerA)\modified = #True
			LowerA + 1
		Wend
	Else
		;Find the middle snakea And length of an optimal path For A And B
		TextSearch_SMS(*THIS, LowerA, UpperA, LowerB, UpperB, SizeA, SizeB, @Ret, A1(), B1())
		;The path is from LowerX To (x,y) And (x,y) ot UpperX
		TextSearch_LCS(*THIS, LowerA, Ret\x, LowerB, Ret\y, SizeA, SizeB, A1(), B1())
		TextSearch_LCS(*THIS, Ret\x, UpperA, Ret\y, UpperB, SizeA, SizeB, A1(), B1())
	EndIf
EndProcedure
Procedure TextSearch_Optimize(*THIS._GETTEXTDIFF_STRUC_)
	;Internal Procedure for Diff-Algorithm
	Protected StartPos, EndPos, Size
	
	Size = *THIS\LinesOrig + 1
	While StartPos < Size
		While StartPos < Size And *THIS\LineOrig(StartPos)\modified = 0
			StartPos + 1
		Wend
		EndPos = StartPos
		While EndPos < Size And *THIS\LineOrig(EndPos)\modified
			EndPos + 1
		Wend
		If EndPos < Size And *THIS\LineOrig(StartPos)\I = *THIS\LineOrig(EndPos)\I
			*THIS\LineOrig(StartPos)\modified = #False
			*THIS\LineOrig(EndPos)\modified   = #True
		Else
			StartPos = EndPos
		EndIf
	Wend
	
	StartPos = 0
	EndPos   = 0
	Size     = *THIS\LinesCheck + 1
	
	While StartPos < Size
		While StartPos < Size And *THIS\LineCheck(StartPos)\modified = 0
			StartPos + 1
		Wend
		EndPos = StartPos
		While EndPos < Size And *THIS\LineCheck(EndPos)\modified
			EndPos + 1
		Wend
		If EndPos < Size And *THIS\LineCheck(StartPos)\I = *THIS\LineCheck(EndPos)\I
			*THIS\LineCheck(StartPos)\modified = #False
			*THIS\LineCheck(EndPos)\modified   = #True
		Else
			StartPos = EndPos
		EndIf
	Wend
EndProcedure
Procedure TextSearch_CreateDiffs(*THIS._GETTEXTDIFF_STRUC_)
	;Internal Procedure for Diff-Algorithm
	Protected SizeA, SizeB, StartA, StartB, LineA, LineB
	SizeA = *THIS\LinesOrig + 1
	SizeB = *THIS\LinesCheck + 1
	
	While LineA < SizeA Or LineB < SizeB
		If LineA < SizeA And *THIS\LineOrig(LineA)\modified = 0 And LineB < SizeB And *THIS\LineCheck(LineB)\modified = 0
			;equal lines
			LineA + 1
			LineB + 1
		Else
			;maybe deleted And/Or inserted lines
			StartA = LineA
			StartB = LineB
			While LineA < SizeA And (LineB >= SizeB Or *THIS\LineOrig(LineA)\modified)
				LineA + 1
			Wend
			While LineB < SizeB And (LineA >= SizeA Or *THIS\LineCheck(LineB)\modified)
				LineB + 1
			Wend
			If StartA < LineA Or StartB < LineB
				;store a new difference-item
				*THIS\DiffBlocks + 1
				If ArraySize(*THIS\MyRes()) < *THIS\DiffBlocks
					ReDim *THIS\MyRes._aITEM_(ArraySize(*THIS\MyRes()) + 500)
				EndIf
				*THIS\MyRes(*THIS\DiffBlocks - 1)\StartA    = StartA
				*THIS\MyRes(*THIS\DiffBlocks - 1)\StartB    = StartB
				*THIS\MyRes(*THIS\DiffBlocks - 1)\DeletedA  = LineA - StartA
				*THIS\MyRes(*THIS\DiffBlocks - 1)\InsertedB = LineB - StartB
			EndIf
		EndIf
	Wend
EndProcedure
Procedure GTD_GetTextDiff(*THIS._GETTEXTDIFF_STRUC_, OriginalFile.s, FileToCheck.s)
	;/--------------
	;| Main Procedure
	;| Loads two textfiles and will immediately
	;| compare them and store differences
	;|
	;| Use the other procedures two see results easily.
	;|
	;/--------------
	Protected Result, FID, uID, j, k, Max, BOM
	Protected ArraySize_Orig, ArraySize_Check
	*THIS\LinesOrig  = -1
	*THIS\LinesCheck = -1
	ArraySize_Orig   = ArraySize(*THIS\LineOrig())
	ArraySize_Check  = ArraySize(*THIS\LineCheck())
	
	FID = ReadFile(#PB_Any, OriginalFile)
	If FID
		BOM = ReadStringFormat(FID)
		While Eof(FID) = 0
			*THIS\LinesOrig + 1
			If *THIS\LinesOrig > ArraySize_Orig
				ReDim *THIS\LineOrig._DIFF_(ArraySize_Orig + 5000)
				ArraySize_Orig + 5000
			EndIf
			*THIS\LineOrig(*THIS\LinesOrig)\S = ReadString(FID, BOM)
			k                           = #True
			For j = 0 To *THIS\LinesOrig - 1
				If *THIS\LineOrig(j)\S = *THIS\LineOrig(*THIS\LinesOrig)\S
					*THIS\LineOrig(*THIS\LinesOrig)\I = *THIS\LineOrig(j)\I
					k                     = #False
					Break
				EndIf
			Next j
			If k
				*THIS\LineOrig(*THIS\LinesOrig)\I = uID
				uID + 1
			EndIf
		Wend
		CloseFile(FID)
	EndIf
	FID = ReadFile(#PB_Any, FileToCheck)
	If FID = 0
		ProcedureReturn 0
	EndIf
	BOM              = ReadStringFormat(FID)
	While Eof(FID) = 0
		*THIS\LinesCheck + 1
		If *THIS\LinesCheck > ArraySize_Check
			ReDim *THIS\LineCheck._DIFF_(ArraySize_Check + 5000)
			ArraySize_Check + 5000
		EndIf
		*THIS\LineCheck(*THIS\LinesCheck)\S = ReadString(FID, BOM)
		k = #True
		For j = 0 To *THIS\LinesCheck - 1
			If *THIS\LineCheck(j)\S = *THIS\LineCheck(*THIS\LinesCheck)\S
				*THIS\LineCheck(*THIS\LinesCheck)\I = *THIS\LineCheck(j)\I
				k                       = #False
				Break
			EndIf
		Next j
		If k
			For j = 0 To *THIS\LinesOrig
				If *THIS\LineOrig(j)\S = *THIS\LineCheck(*THIS\LinesCheck)\S
					*THIS\LineCheck(*THIS\LinesCheck)\I = *THIS\LineOrig(j)\I
					k                             = #False
					Break
				EndIf
			Next j
			If k
				*THIS\LineCheck(*THIS\LinesCheck)\I = uID
				uID + 1
			EndIf
		EndIf
	Wend
	CloseFile(FID)
	MAX = *THIS\LinesOrig + *THIS\LinesCheck + 1
	Dim DV.i(2 * MAX + 2)
	Dim UV.i(2 * MAX + 2)
	TextSearch_LCS(*THIS, 0, *THIS\LinesOrig + 1, 0, *THIS\LinesCheck + 1, *THIS\LinesOrig + 1, *THIS\LinesCheck + 1, DV(), UV())
	TextSearch_Optimize(*THIS)
	*THIS\DiffBlocks = 0
	TextSearch_CreateDiffs(*THIS)
	Result = *THIS\DiffBlocks
	
	ProcedureReturn Result
EndProcedure
Procedure GetTextDiff_CreateInterface()
	;Create the damn Interface
	Protected *G._GETTEXTDIFF_STRUC_
	
	*G = AllocateMemory(SizeOf(_GETTEXTDIFF_STRUC_))
	If *G
		InitializeStructure(*G, _GETTEXTDIFF_STRUC_)
		*G\VTable = ?_TEXTDIFF_PROCEDURES_
	EndIf
	
	ProcedureReturn *G
EndProcedure
DataSection
	_TEXTDIFF_PROCEDURES_:
	Data.i @GTD_GetTextDiff()
	Data.i @GTD_CountDiffBlocks()
	Data.i @GTD_DiffBlockType()
	Data.i @GTD_DiffBlockSize()
	Data.i @GTD_DiffBlockLine()
	Data.i @GTD_GetOriginalFileLine()
	Data.i @GTD_GetCheckFileLine()
EndDataSection
Code: Alles auswählen
EnableExplicit
XIncludeFile "GetTextDiff.pbi"
;- Edit here
#File_Original = "C:\test.txt"
#File_ToCheck  = "C:\test2.txt"
Procedure main()
	Protected *TD.InterfaceGetTextDiff
	Protected i, j, k, LineNo, Type, Size, a$
	*TD.InterfaceGetTextDiff = GetTextDiff_CreateInterface()
	If *TD\GetTextDiff(#File_Original, #File_ToCheck)
		j = *TD\CountDiffBlocks()
		If j
			For i = 0 To j - 1
				Type = *TD\DiffBlockType(i)
				If Type & #DIFF_DELETE_BLOCK
					Size = *TD\DiffBlockSize(i, #DIFF_DELETE_BLOCK)
					Debug "Deleted following " + Str(Size) + " Line(s):"
					For k = 0 To Size - 1
						a$ = *TD\DiffBlockLine(i, #DIFF_DELETE_BLOCK, k, @LineNo)
						Debug "Line " + Str(LineNo) + ": " + a$
					Next k
				EndIf
				If Type & #DIFF_INSERT_BLOCK
					Size = *TD\DiffBlockSize(i, #DIFF_INSERT_BLOCK)
					Debug "Added following " + Str(Size) + " Line(s):"
					For k = 0 To Size - 1
						a$ = *TD\DiffBlockLine(i, #DIFF_INSERT_BLOCK, k, @LineNo)
						Debug "Line " + Str(LineNo) + ": " + a$
					Next k
				EndIf
			Next i
		EndIf
	EndIf
EndProcedure
main()