A deep dive into git internals.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
git-guts-fs/git-guts/Verify.fs

183 lines
8.6 KiB

namespace git_guts
open System
open System.IO
open System.Text
open System.Text.RegularExpressions
// --------------------------------------------------------------------
[<AutoOpen>]
module VerifyObjects =

    // Decode the body of a path that git has C-style quoted (i.e. the text between the surrounding
    // double-quotes) and return the path it represents.
    // Git escapes some bytes as two-character sequences (\a \b \t \n \v \f \r \" \\) and others
    // as three-digit octal sequences (\ooo). Everything else is literal text. The decoded bytes
    // are interpreted as UTF-8.
    // Fails if the text ends in the middle of an escape sequence.
    let private _unquoteGitPath (quoted: string) =
        let bytes = ResizeArray<byte>()
        let mutable pos = 0
        while pos < quoted.Length do
            let escPos = quoted.IndexOf( '\\', pos )
            if escPos < 0 then
                // no more escapes - everything that's left is literal text
                bytes.AddRange( Encoding.UTF8.GetBytes( quoted.Substring( pos ) ) )
                pos <- quoted.Length
            else
                // copy the literal text in front of the escape (as a single run, so that
                // surrogate pairs are never split), then decode the escape itself
                bytes.AddRange( Encoding.UTF8.GetBytes( quoted.Substring( pos, escPos - pos ) ) )
                if escPos + 1 >= quoted.Length then
                    failwithf "Incomplete escape sequence in quoted path: %s" quoted
                let simpleEscape =
                    match quoted.[escPos+1] with
                    | 'a' -> Some 0x07uy
                    | 'b' -> Some 0x08uy
                    | 't' -> Some 0x09uy
                    | 'n' -> Some 0x0Auy
                    | 'v' -> Some 0x0Buy
                    | 'f' -> Some 0x0Cuy
                    | 'r' -> Some 0x0Duy
                    | '"' -> Some 0x22uy
                    | '\\' -> Some 0x5Cuy
                    | _ -> None
                match simpleEscape with
                | Some b ->
                    bytes.Add( b )
                    pos <- escPos + 2
                | None ->
                    // must be a 3-digit octal escape (Convert.ToByte will reject anything else)
                    if escPos + 4 > quoted.Length then
                        failwithf "Incomplete escape sequence in quoted path: %s" quoted
                    bytes.Add( Convert.ToByte( quoted.Substring( escPos+1, 3 ), 8 ) )
                    pos <- escPos + 4
        Encoding.UTF8.GetString( bytes.ToArray() )

    // Adjust git's output for TREE objects so that it matches what we output.
    // Each input line is expected to look like "<perms> <type> <40-hex object name>TAB<path>",
    // and is rewritten as "<perms> <object name> <path>". Paths that git has quoted are decoded
    // back to the real path. Trailing whitespace is removed from the final result.
    // Fails if a line doesn't have the expected format.
    let private _adjustGitTreeOutput (objDump: string) =
        // NOTE: The object name and path are separated by exactly one TAB. Matching on general
        // whitespace here would swallow any leading whitespace that belongs to the path.
        let entryRegex = Regex( @"^(\d+) [a-z]+ ([0-9a-f]{40})\t(.+)$" )
        let buf = StringBuilder()
        use reader = new StringReader( objDump )
        let mutable line = reader.ReadLine()
        while not ( isNull line ) do
            let m = entryRegex.Match( line )
            if not m.Success then
                failwithf "Unexpected line in git's tree output: %s" line
            let perms, objName = m.Groups.[1].Value, m.Groups.[2].Value
            let rawPath = m.Groups.[3].Value
            // FUDGE! git outputs UTF-8 bytes as octal-encoded strings, not the bytes themselves?!
            let path =
                if rawPath.Length >= 2 && rawPath.[0] = '"' && rawPath.[rawPath.Length-1] = '"' then
                    _unquoteGitPath ( rawPath.Substring( 1, rawPath.Length-2 ) )
                else
                    rawPath
            buf.AppendFormat( "{0} {1} {2}\n", perms, objName, path ) |> ignore
            line <- reader.ReadLine()
        buf.ToString().TrimEnd()

    // Check every object in a repo against what git itself reports, then check that looking up
    // objects that don't exist correctly finds nothing.
    //   repoDir:  the repo to check (passed through to the object lookup and git helpers)
    //   progress: if true, per-object progress messages are written to stderr
    // Fails on the first object that can't be found, or whose type or content differs from what
    // "git cat-file" returns. On a content mismatch, both versions are saved in the temp
    // directory (git-content.expected / git-content.actual) for inspection.
    let verifyObjects repoDir progress =

        // NOTE: This will iterate over every object in a repo, and compare what we retrieve with what
        // "git cat-file" returns. In particular, this will include *every* revision of *every* file,
        // so for large repos, it will take some time...

        // initialize
        disableSpectreCapabilities ()
        let reportChecked nObjs =
            printfn "- Checked %s." ( plural nObjs "object" "objects" )
        let mutable currPackFname = "" // nb: an empty name means "loose objects"
        let mutable nChecked = 0 // nb: number of objects checked in the current pack (or loose)

        // check each object in the repo
        printfn "Checking loose objects..." // nb: because getObjNames returns loose objects first
        for objName, fname in getObjNames repoDir do

            // check if we have a loose object or an object in a pack
            let packFname =
                if Path.GetExtension( fname ) = ".pack" then Path.GetFileName( fname ) else ""

            // check if we've started a new pack
            if packFname <> currPackFname then
                // yup - log the end of the current one, then start counting for the new one
                reportChecked nChecked
                currPackFname <- packFname
                nChecked <- 0
                let fsize = FileInfo( fname ).Length
                printfn ""
                printfn "Checking pack file (%s): %s" (friendlyByteCount fsize) currPackFname

            // find the next object
            if progress then
                eprintfn "- Checking object: %s" objName
            let objRec =
                match _findRepoObjRec repoDir objName with
                | Some objRec -> objRec
                | None -> failwithf "Can't find object: %s" objName
            let gitObj = makeGitObject objRec
            nChecked <- nChecked + 1
            if progress then
                eprintfn " - Got %s: #bytes=%d" gitObj.objType objRec.objData.Length

            // check the object type
            let expectedObjType = ( runGitText repoDir "cat-file" [ "-t"; objName ] ).TrimEnd()
            if gitObj.objType <> expectedObjType then
                failwithf "Object type mismatch for %s: got \"%s\", expected \"%s\"." objName gitObj.objType expectedObjType

            // check the object data
            // NOTE: Tree objects are compared as text dumps (ours vs. git's pretty-printed output),
            // everything else is compared as raw bytes.
            let gitOutput = runGit repoDir "cat-file" [ "-p"; objName ]
            let actualData, expectedData =
                if gitObj.objType = "tree" then
                    let ourDump =
                        using ( new CaptureStdout() ) ( fun cap ->
                            gitObj.dumpObj()
                            cap.getOutput.TrimEnd()
                        )
                    let gitDump = _adjustGitTreeOutput ( Encoding.UTF8.GetString gitOutput )
                    Encoding.UTF8.GetBytes( ourDump ), Encoding.UTF8.GetBytes( gitDump )
                else
                    objRec.objData, gitOutput
            if actualData <> expectedData then
                let dname = Path.GetTempPath()
                File.WriteAllBytes( Path.Join( dname, "git-content.expected" ), expectedData )
                File.WriteAllBytes( Path.Join( dname, "git-content.actual" ), actualData )
                failwithf "Object data mismatch for %s." objName
        reportChecked nChecked

        // NOTE: These functions generate object names that are invalid (they contain a non-hex character),
        // but they will always compare greater/less than a valid name, based on the first byte, which will
        // help test how we use the fanout table, and the binary search through the table of object names.
        let makeObjName1 byte0 =
            sprintf "%02x%s!" byte0 (String('0',37))
        let makeObjName2 byte0 =
            sprintf "%02x%sz" byte0 (String('f',37))
        // NOTE: Also test with an object name that appears in the middle of the range, for a given first byte.
        let makeObjName3 byte0 =
            sprintf "%02x%s" byte0 "80808080808080808080808080808080808080"

        // verify looking up unknown objects
        printfn ""
        printfn "Checking unknown objects..."
        let mutable nUnknown = 0
        for makeObjName in [ makeObjName1; makeObjName2; makeObjName3 ] do
            for byte0 = 0 to 255 do
                let objName = makeObjName byte0
                if Option.isSome ( findRepoObject repoDir objName ) then
                    failwithf "Unexpectedly found object: %s" objName
                nUnknown <- nUnknown + 1
        // nb: the format string supplies the trailing period, so the plural form must not have one
        printfn "- Checked %s." ( plural nUnknown "unknown object" "unknown objects" )
// --------------------------------------------------------------------
[<AutoOpen>]
module VerifyLogs =

    // Check every log file found in a repo against what "git reflog show" reports for its ref.
    // For each log file, we rebuild the text git is expected to print from the entries we read,
    // and compare it with git's actual output.
    //   repoDir: the repo to check (passed through to the log file and git helpers)
    // Fails on the first ref whose output doesn't match. Both versions are then saved in the
    // temp directory (git-log.expected / git-log.actual) for inspection.
    let verifyLogs repoDir =

        // verify reading each log file
        for logRef, fname in _findLogFiles repoDir do
            printfn "Processing log file: %s" fname

            // run git to get the log entries for the current ref
            // NOTE: Branch refs are shortened by dropping their "refs/heads/" prefix.
            let ref2 =
                let headsPrefix = "refs/heads/"
                let normalized = logRef.Replace( Path.DirectorySeparatorChar, '/' )
                if normalized.Length > headsPrefix.Length && normalized.StartsWith( headsPrefix, StringComparison.Ordinal ) then
                    normalized.Substring( headsPrefix.Length )
                else
                    normalized
            let expected = ( runGitText repoDir "reflog" [| "show"; ref2 |] ).TrimEnd()

            // NOTE: git shows just enough of the object names for them to be unique, so we need
            // to figure out how much that is, so that we can generate the same output :-/
            // We assume the first line is a log entry, that starts with an abbreviated object name.
            // If git returned nothing (or something with no space in it), we fall back to a length
            // of 0, so that any entries we have are reported as a mismatch below, instead of
            // crashing while trying to extract a negative-length prefix.
            let objNamePrefixLen = max 0 ( expected.IndexOf( ' ' ) )

            // extract the log entries for the current ref (newest first, to match git's output)
            let buf = StringBuilder()
            let mutable nLogEntries = 0
            for logEntryNo, logEntry in _readLogFile fname |> Seq.rev |> Seq.indexed do
                match logEntry.nextRef with
                | Some nextRef ->
                    let objNamePrefix = nextRef.Substring( 0, min objNamePrefixLen nextRef.Length )
                    buf.AppendFormat( "{0} {1}@{{{2}}}: {3}", objNamePrefix, ref2, logEntryNo, logEntry.entryType.Value ) |> ignore
                    if logEntry.msg.IsSome then
                        buf.AppendFormat( ": {0}", logEntry.msg.Value ) |> ignore
                    buf.AppendLine( "" ) |> ignore
                | None -> ()
                // nb: every entry read from the log file is counted, including any not output above
                nLogEntries <- nLogEntries + 1
            let output = buf.ToString().TrimEnd()

            // compare what we extracted with the git output
            if output <> expected then
                let dname = Path.GetTempPath()
                File.WriteAllText( Path.Join( dname, "git-log.expected" ), expected )
                File.WriteAllText( Path.Join( dname, "git-log.actual" ), output )
                failwithf "Mismatched output for ref: %s" ref2
            printfn "- Checked %s." ( plural nLogEntries "log entry" "log entries" )