diff --git a/scrape.py b/scrape.py
index 047e3a0..b2039b6 100644
--- a/scrape.py
+++ b/scrape.py
@@ -1,30 +1,52 @@
 import os
+from pathlib import Path
+import re
 import shutil
 
 # ####################################################################################################
 # some alignment vars
-offsetRelativeAddressCorrection = 0x01
-startOfFirstFilenameDefinitionAddress = 0x12
 filenameDefinitionLength = 0x02
-relativeFilePermissionsAddress = 0x01 - offsetRelativeAddressCorrection
-filePermissionsLength = 0x02
-relativeFileDateAddress = 0x03 - offsetRelativeAddressCorrection
-fileDateLength = 0x04
-relativeTrueAddress = 0x02 - offsetRelativeAddressCorrection
-trueLength = 0x01
-relativeFileNameLengthAddress = 0x03 - offsetRelativeAddressCorrection
+unknownDefinition1Length = 0x02
+null2Length = 0x02
+unknownDefinition2Length = 0x04
+null1Length = 0x01
+continueReadingLength = 0x01
+#null2Length = 0x02
+numberOfDirectoryNamesLength = 0x01
+null3Length = 0x03
+#filenameDefinitionLength = 0x02
+unknownDefinition3Length = 0x02
+#null2Length = 0x02
+unknownDefinition4Length = 0x04
+#null1Length = 0x01
+#continueReadingLength = 0x01
+#null2Length = 0x02
 fileNameLengthLength = 0x01
-relativeFileNameAddress = 0x04 - offsetRelativeAddressCorrection
-#fileNameLength = 0x01 # this is captured dynamically within the _parent file read loop
-relativeNextFileDefinitionAddress = 0x09 - offsetRelativeAddressCorrection
+#null3Length = 0x03
+#fileNameLength = 0x??
+unknownDefinition5Length = 0x02
+unknownDefinition6Length = 0x02
+null4Length = 0x04
+fileTypeLength = 0x01
+
+# this is to help convert a byte to a bool (b'\x00' = 0, else = 1)
+byteCompareForBool = b'\x00'
+
+# when reading the file type, these are the types i've observed
+fileTypeMappingFile = b'\x05'
+fileTypeMappingDir = b'\x04'
 
 # ####################################################################################################
 # walk through the filesystem
+#testFileAttrName = '/mnt/test/5.f_head/all/#5:f1988779:::10002413871.00000000:head#/attr/_parent'
+#testFileDataName = '/mnt/test/5.f_head/all/#5:f1988779:::10002413871.00000000:head#/data'
 testFileAttrName = '/mnt/test/5.f_head/all/#5:f1988779:::10002413871.00000000:head#/attr/_parent'
 testFileDataName = '/mnt/test/5.f_head/all/#5:f1988779:::10002413871.00000000:head#/data'
 
 # fuse mounted osd path
+#testRoot = '/mnt/test/5.f_head/all/#5:f1988779:::10002413871.00000000:head#'
+testRoot = '/mnt/test/5.7f_head/all/#5:fea38466:::100028f0fe1.00000000:head#'
 fuseRoot = '/mnt/test'
 
 # destination dirs
@@ -39,111 +61,243 @@ relativeFolderStructureDir = '/attr'
 folderStructureFile = '_parent'
 dataFilename = 'data'
 
+# placement group indicator extractor
+placementGroupNameRegex = r'(#.*?):+(.*?):+(.*?)\.(.*?):+(.*?)#$'
+#                            g1      g2      g3     g4      g5
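+# worked example (editor's note - the dir name below is the testRoot sample above):
+#   re.search(placementGroupNameRegex, '#5:fea38466:::100028f0fe1.00000000:head#').groups()
+#   -> ('#5', 'fea38466', '100028f0fe1', '00000000', 'head')
+#   group(3) is the file's unique indicator, group(4) is its chunk suffix in hex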
+
+# set up the walk generator for the fuse root
+#fuseRootDirs = os.walk(testRoot)
+fuseRootDirs = os.walk(fuseRoot)
+
 # walk through the fuse-mounted OSD
-for fullPaths, dirNames, fileNames in os.walk(fuseRoot):
+#for fullPaths in fuseRootDirs:
+for fullPaths, _, _ in fuseRootDirs:
     # dont walk into excluded dirs
     if not exclusionDirs in fullPaths:
         # only walk into dirs which are attr dir
        if fullPaths.endswith(relativeFolderStructureDir):
             # at this point we've got the dirs we want
-            # now we can walk up 1 dir for the (assumed) placement group's main dir
             pgMainDir = os.path.normpath(os.path.dirname(fullPaths))
 
             # join up the main dir with the folder structure file
             pgFolderStructureFile = os.path.normpath(os.path.join(pgMainDir, relativeFolderStructureDir[1:], folderStructureFile))
 
-            # only proceed if the folder structure file exists
-            if os.path.exists(pgFolderStructureFile):
-                # join up the main dir with the data file
-                pgDataFile = os.path.normpath(os.path.join(pgMainDir, dataFilename))
+            # extract the file's unique reference
+            pgDataFileRegexSearch = re.search(placementGroupNameRegex, Path(pgFolderStructureFile).parent.parent.name)
+
+            # only proceed if a match has been made. this means that there _is_ a data file to grab
+            if pgDataFileRegexSearch:
+                pgDataFileUniqueIndicator = pgDataFileRegexSearch.group(3)
+                pgDataFileChunk = int(pgDataFileRegexSearch.group(4), 16)
-            # only proceed if the data file exists
-            if os.path.exists(pgDataFile):
-                # ####################################################################################################
-                # running the loop to scrape the file/folder structure
+                # only proceed if the folder structure file exists
+                if os.path.exists(pgFolderStructureFile):
+                    # join up the main dir with the data file
+                    pgDataFile = os.path.normpath(os.path.join(pgMainDir, dataFilename))
-                # empty list for saving the path details
-                filePathInformation = []
-
-                # open the file readonly as a binary file
-                with open(pgFolderStructureFile, mode='rb') as file:
-                    # get EOF seek address - seek to 0 bytes from the end of file (2)
-                    file.seek(0, 2)
-                    eofAddress = file.tell()
+                    # only proceed if the data file exists
+                    if os.path.exists(pgDataFile):
+                        # ####################################################################################################
+                        # running the loop to scrape the file/folder structure
-                    # at the start of the file, we will load up the first address
-                    file.seek(startOfFirstFilenameDefinitionAddress, 0)
+                        # empty list for saving the path details
+                        filePathInformation = []
-                    # seek through the binary file until our seek cursor is at the end of the file
-                    while(file.tell() < eofAddress):
-                        file.read(filenameDefinitionLength).hex(' ')
-                        #print(file.read(filenameDefinitionLength).hex(' '))
-                        #todo: ensure that this value is 0x02 0x02
+                        # open the file readonly as a binary file
+                        with open(pgFolderStructureFile, mode='rb') as file:
+                            # get EOF seek address - seek to 0 bytes from the end of file (2)
+                            file.seek(0, 2)
+                            eofAddress = file.tell()
-                        #todo: not sure if this the permission value
-                        file.seek(relativeFilePermissionsAddress, 1)
-                        file.read(filePermissionsLength).hex(' ')
-                        #print(file.read(filePermissionsLength).hex(' '))
+                            # return to beginning of file - seek to 0 bytes from the beginning of file (0)
+                            file.seek(0, 0)
-                        #todo: not sure if this is the date value
-                        file.seek(relativeFileDateAddress, 1)
-                        file.read(fileDateLength).hex(' ')
-                        #print(file.read(fileDateLength).hex(' '))
+                            # _parent file header
+                            startDefinition = file.read(filenameDefinitionLength)
+                            #print(startDefinition.hex(' '))
+                            #todo: ensure that this value is 0x05 0x04
-                        #todo: this _appears_ to be always true byte (ie. 0x01). perhaps it is a alignment byte?
-                        file.seek(relativeTrueAddress, 1)
-                        file.read(trueLength).hex(' ')
-                        #print(file.read(trueLength).hex(' '))
-                        #todo: ensure that this value is 0x01
+                            # unknown block 1
+                            unknownDefinition1 = file.read(unknownDefinition1Length)
+                            #print(unknownDefinition1.hex(' '))
-                        file.seek(relativeFileNameLengthAddress, 1)
-                        fileNameLength = file.read(fileNameLengthLength)
-                        #print(fileNameLength.hex(' '))
+                            # null with length 2
+                            file.read(null2Length)
-                        file.seek(relativeFileNameAddress, 1)
-                        fileName = file.read(int.from_bytes(fileNameLength)).decode('utf-8')
-                        #print(fileName)
+                            # unknown block 2
+                            unknownDefinition2 = file.read(unknownDefinition2Length)
+                            #print(unknownDefinition2.hex(' '))
-                        # append the file name that we have captured to a list
-                        if fileName:
-                            filePathInformation.append(fileName)
+                            # null with length 1
+                            file.read(null1Length)
-                        # move to the next file/dir name definition
-                        file.seek(relativeNextFileDefinitionAddress, 1)
-
-                # ####################################################################################################
-                # handling the filename with the data file
-
-                # add root dirs
-                filePathInformation.append(destRoot)
-                filePathInformation.append(mntDir)
-
-                # first reverse the list so that it is easier to create dir structure
-                filePathInformation.reverse()
-
-                # joins up all the dirs with the destination root dir. excludes filename
-                newDir = os.path.normpath(os.path.join(*filePathInformation[:-1]))
-                #print(newDir)
-
-                newFile = os.path.normpath(os.path.join(*filePathInformation))
-                #print(newFile)
-
-                # ####################################################################################################
-                # FILE RECOVERY
-
-                #BUG: a data file can be zero bytes. however i found that in the OSD, a zero byte data file can also be an empty folder
-                # in this case, ill skip them. by inverting the condition, i can process them if i find that im missing files
-
-                # skip files data files that are 0 bytes
-                if not os.stat(pgDataFile).st_size == 0:
-                    # make that dir
-                    if not os.path.exists(newDir):
-                        print('new dir: ' + newDir)
-                        os.makedirs(newDir)
+                            # continue reading byte
+                            continueReading = file.read(continueReadingLength) != byteCompareForBool
+                            #print(continueReading)
+                            #todo: it should always be 0x01 here as we are in the header
-                # copy the data file to the fullpath file
-                if not os.path.exists(newFile):
-                    print('old file: ' + pgDataFile)
-                    print('new file: ' + newFile)
-                    shutil.copyfile(pgDataFile, newFile)
+                            # null with length 2
+                            file.read(null2Length)
+
+                            # number of directory names in path
+                            numberOfDirectoryNames = file.read(numberOfDirectoryNamesLength)
+                            #print(numberOfDirectoryNames.hex(' '))
+
+                            # null with length 3
+                            file.read(null3Length)
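+
+                            # header layout implied by the reads above (editor's summary - the 'unknown'
+                            # blocks are unidentified fields, this is not a documented format):
+                            #   0x00  2 bytes  start definition (expected 0x05 0x04)
+                            #   0x02  2 bytes  unknown block 1
+                            #   0x04  2 bytes  null
+                            #   0x06  4 bytes  unknown block 2
+                            #   0x0a  1 byte   null
+                            #   0x0b  1 byte   continue reading flag (0x00 = stop)
+                            #   0x0c  2 bytes  null
+                            #   0x0e  1 byte   number of directory names in path
+                            #   0x0f  3 bytes  null
+                            #   0x12  first name record (matches the old startOfFirstFilenameDefinitionAddress)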
+
+                            # seek through the binary file until our seek cursor is at the end of the file
+                            #while(file.tell() < eofAddress):
+                            while(file.tell() < eofAddress and continueReading):
+                                # file header
+                                startFileDefinition = file.read(filenameDefinitionLength)
+                                #print(startFileDefinition.hex(' '))
+                                #todo: ensure that this value is 0x02 0x02
+
+                                # unknown block 3
+                                unknownDefinition3 = file.read(unknownDefinition3Length)
+                                #print(unknownDefinition3.hex(' '))
+
+                                # null with length 2
+                                file.read(null2Length)
+
+                                # unknown block 4
+                                unknownDefinition4 = file.read(unknownDefinition4Length)
+                                #print(unknownDefinition4.hex(' '))
+
+                                # null with length 1
+                                file.read(null1Length)
+
+                                # continue reading byte
+                                continueReading = file.read(continueReadingLength) != byteCompareForBool
+                                #print(continueReading)
+                                # this should stop the loop after reading the last file path
+
+                                # null with length 2
+                                file.read(null2Length)
+
+                                # length of the filename
+                                fileNameLength = file.read(fileNameLengthLength)
+                                #print(fileNameLength.hex(' '))
+
+                                # null with length 3
+                                file.read(null3Length)
+
+                                # this is the filename. read length is based on above length of filename
+                                fileName = file.read(int.from_bytes(fileNameLength, 'little')).decode('utf-8')
+                                #print(fileName)
+
+                                # append the file name that we have captured to a list
+                                if fileName:
+                                    filePathInformation.append(fileName)
+
+                                # unknown block 5
+                                unknownDefinition5 = file.read(unknownDefinition5Length)
+                                #print(unknownDefinition5.hex(' '))
+
+                                # unknown block 6
+                                unknownDefinition6 = file.read(unknownDefinition6Length)
+                                #print(unknownDefinition6.hex(' '))
+
+                                # null with length 4
+                                file.read(null4Length)
+
+                            # after the loop, we have 1 final area to check and that is the item type
+                            fileType = file.read(fileTypeLength)
+                            #print(fileType.hex(' '))
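+
+                            # editor's note: fileType gets compared against the observed mappings at the
+                            # top of the script (b'\x05' = file, b'\x04' = dir). any other value matches
+                            # neither recovery branch below, so such an entry would be silently skipped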
+
+                        # ####################################################################################################
+                        # handling the filename with the data file
+
+                        # add root dirs
+                        filePathInformation.append(destRoot)
+                        filePathInformation.append(mntDir)
+
+                        # first reverse the list so that it is easier to create dir structure
+                        filePathInformation.reverse()
+
+                        # joins up all the dirs with the destination root dir. excludes filename
+                        newDir = os.path.normpath(os.path.join(*filePathInformation[:-1]))
+                        #print(newDir)
+
+                        newFile = os.path.normpath(os.path.join(*filePathInformation))
+                        #print(newFile)
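+
+                        # worked example (editor's sketch - 'recovered' and '/mnt/dest' are made-up values,
+                        # the real destRoot/mntDir live in the unchanged 'destination dirs' block above):
+                        #   names read from _parent, leaf first: ['file.txt', 'docs', 'home']
+                        #   append destRoot ('recovered') and mntDir ('/mnt/dest'), then reverse:
+                        #   ['/mnt/dest', 'recovered', 'home', 'docs', 'file.txt']
+                        #   newDir  -> /mnt/dest/recovered/home/docs
+                        #   newFile -> /mnt/dest/recovered/home/docs/file.txt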
+
+                        # ####################################################################################################
+                        # FILE RECOVERY
+
+                        # if it is a directory, make that directory!
+                        if fileType == fileTypeMappingDir:
+                            if not os.path.exists(newFile):
+                                print('new dir: ' + newFile)
+                                os.makedirs(newFile)
+
+                        # if it is a file, copy that file!
+                        if fileType == fileTypeMappingFile:
+                            if not os.path.exists(newFile):
+                            #if os.path.exists(newFile):
+                                if not os.path.exists(newDir):
+                                    print('new dir: ' + newDir)
+                                    os.makedirs(newDir)
+
+                                print('old file: ' + pgDataFile)
+                                print('new file: ' + newFile)
+                                shutil.copyfile(pgDataFile, newFile)
+
+                                #NEW BUG ALERT: files over 4096KB are TRUNCATED! likely means that the files were ripped into pieces
+                                # RIP MY FILES INTO PIECES
+                                # THIS IS MY LAST RESTORE
+
+                                # if the file is 4MB, then we must also check if it has been chunked
+                                if os.stat(pgDataFile).st_size == 4194304:
+                                    listOfPgDataFileChunkPaths = []
+
+                                    print('data file size is 4MB. checking if chunked...')
+
+                                    # search for the placement group's file identifier in other placement groups
+                                    #WARNING: this process is SLOW as i have to re-scan the whole dir
+                                    # (this is a fresh os.walk - re-using the fuseRootDirs generator here would
+                                    # consume the outer loop's walk and end the whole scan early)
+                                    #for pgDataFileUniqueIndicatorFullPaths in fuseRootDirs:
+                                    for pgDataFileUniqueIndicatorFullPaths, _, _ in os.walk(fuseRoot):
+                                        # only walk into dirs which contain the file's unique identifier
+                                        if pgDataFileUniqueIndicator in pgDataFileUniqueIndicatorFullPaths:
+                                            listOfPgDataFileChunkPaths.append(os.path.normpath(pgDataFileUniqueIndicatorFullPaths))
+
+                                    # count the number of directories found. logic:
+
+                                    # if the number of chunked files found is only 1, then it really _is_ a 4MB file
+                                    if len(listOfPgDataFileChunkPaths) == 1:
+                                        print('data file not chunked')
+
+                                    # if there are more than 1 dir found, then append the data to the already-written file (above)
+                                    if len(listOfPgDataFileChunkPaths) > 1:
+                                        print('chunked data file found!')
+
+                                        # iterate over the list of file chunk paths
+                                        for pgDataFileChunkPath in listOfPgDataFileChunkPaths:
+
+                                            # increase the chunk iterator
+                                            pgDataFileChunk += 1
+
+                                            # convert it to a hex string (for searching the directory names)
+                                            pgDataFileChunkString = f'{pgDataFileChunk:0>8x}'
+
+                                            # loop through the list (again) and filter by current chunk
+                                            for pgDataFileChunkPath2 in listOfPgDataFileChunkPaths:
+                                                if pgDataFileChunkString in pgDataFileChunkPath2:
+
+                                                    # this gets the main dir of the data chunk file
+                                                    pgDataChunkMainDir = os.path.normpath(os.path.dirname(pgDataFileChunkPath2))
+
+                                                    # this gets the path of the data chunk file
+                                                    pgDataChunkFile = os.path.normpath(os.path.join(pgDataChunkMainDir, dataFilename))
+
+                                                    #print(pgDataChunkFile)
+                                                    print(pgDataFileChunkPath2)
+
+                                                    # read the file's chunk data file
+                                                    with open(pgDataChunkFile, 'rb') as pgDataFileChunkDataFile_file:
+                                                        # write to the file in append binary mode
+                                                        with open(newFile, 'ab') as newFile_file:
+                                                            #print('reading from: ' + pgDataChunkFile)
+                                                            newFile_file.write(pgDataFileChunkDataFile_file.read())
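+
+# chunk naming, for reference (editor's note - the second field is elided as '...' since it
+# appears to differ per chunk, hence the search above rather than a direct path lookup; the
+# id and suffix here come from the testRoot sample at the top of the script):
+#   #5:fea38466:::100028f0fe1.00000000:head#   <- chunk 0, whose 'data' file was copied above
+#   #5:...:::100028f0fe1.00000001:head#        <- chunk 1, typically in another placement group
+#   f'{int("00000000", 16) + 1:0>8x}' -> '00000001' is how the next chunk suffix gets derived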