You've already forked ceph-osd-file-extractor
Update scrape.py
This commit is contained in:
346
scrape.py
346
scrape.py
@@ -1,30 +1,52 @@
|
|||||||
import os
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
|
|
||||||
# ####################################################################################################
|
# ####################################################################################################
|
||||||
# some alignment vars
|
# some alignment vars
|
||||||
offsetRelativeAddressCorrection = 0x01
|
|
||||||
|
|
||||||
startOfFirstFilenameDefinitionAddress = 0x12
|
|
||||||
filenameDefinitionLength = 0x02
|
filenameDefinitionLength = 0x02
|
||||||
relativeFilePermissionsAddress = 0x01 - offsetRelativeAddressCorrection
|
unknownDefinition1Length = 0x02
|
||||||
filePermissionsLength = 0x02
|
null2Length = 0x02
|
||||||
relativeFileDateAddress = 0x03 - offsetRelativeAddressCorrection
|
unknownDefinition2Length = 0x04
|
||||||
fileDateLength = 0x04
|
null1Length = 0x01
|
||||||
relativeTrueAddress = 0x02 - offsetRelativeAddressCorrection
|
continueReadingLength = 0x01
|
||||||
trueLength = 0x01
|
#null2Length = 0x02
|
||||||
relativeFileNameLengthAddress = 0x03 - offsetRelativeAddressCorrection
|
numberOfDirectoryNamesLength = 0x01
|
||||||
|
null3Length = 0x03
|
||||||
|
#filenameDefinitionLength = 0x02
|
||||||
|
unknownDefinition3Length = 0x02
|
||||||
|
#null2Length = 0x02
|
||||||
|
unknownDefinition4Length = 0x04
|
||||||
|
#null1Length = 0x01
|
||||||
|
#continueReadingLength = 0x01
|
||||||
|
#null2Length = 0x02
|
||||||
fileNameLengthLength = 0x01
|
fileNameLengthLength = 0x01
|
||||||
relativeFileNameAddress = 0x04 - offsetRelativeAddressCorrection
|
#null3Length = 0x03
|
||||||
#fileNameLength = 0x01 # this is captured dynamically within the _parent file read loop
|
#fileNameLength = 0x??
|
||||||
relativeNextFileDefinitionAddress = 0x09 - offsetRelativeAddressCorrection
|
unknownDefinition5Length = 0x02
|
||||||
|
unknownDefinition6Length = 0x02
|
||||||
|
null4Length = 0x04
|
||||||
|
fileTypeLength = 0x01
|
||||||
|
|
||||||
|
# this is to help convert a byte to a bool (b'\x00' = 0, else = 1)
|
||||||
|
byteCompareForBool = b'\x00'
|
||||||
|
|
||||||
|
# when reading the file type, these are the types ive observed
|
||||||
|
fileTypeMappingFile = b'\x05'
|
||||||
|
fileTypeMappingDir = b'\x04'
|
||||||
|
|
||||||
# ####################################################################################################
|
# ####################################################################################################
|
||||||
# walk through the filesystem
|
# walk through the filesystem
|
||||||
|
#testFileAttrName = '/mnt/test/5.f_head/all/#5:f1988779:::10002413871.00000000:head#/attr/_parent'
|
||||||
|
#testFileDataName = '/mnt/test/5.f_head/all/#5:f1988779:::10002413871.00000000:head#/data'
|
||||||
testFileAttrName = '/mnt/test/5.f_head/all/#5:f1988779:::10002413871.00000000:head#/attr/_parent'
|
testFileAttrName = '/mnt/test/5.f_head/all/#5:f1988779:::10002413871.00000000:head#/attr/_parent'
|
||||||
testFileDataName = '/mnt/test/5.f_head/all/#5:f1988779:::10002413871.00000000:head#/data'
|
testFileDataName = '/mnt/test/5.f_head/all/#5:f1988779:::10002413871.00000000:head#/data'
|
||||||
|
|
||||||
# fuse mounted osd path
|
# fuse mounted osd path
|
||||||
|
#testRoot = '/mnt/test/5.f_head/all/#5:f1988779:::10002413871.00000000:head#'
|
||||||
|
testRoot = '/mnt/test/5.7f_head/all/#5:fea38466:::100028f0fe1.00000000:head#'
|
||||||
fuseRoot = '/mnt/test'
|
fuseRoot = '/mnt/test'
|
||||||
|
|
||||||
# destination dirs
|
# destination dirs
|
||||||
@@ -39,111 +61,243 @@ relativeFolderStructureDir = '/attr'
|
|||||||
folderStructureFile = '_parent'
|
folderStructureFile = '_parent'
|
||||||
dataFilename = 'data'
|
dataFilename = 'data'
|
||||||
|
|
||||||
|
# placement group indicator extractor
|
||||||
|
placementGroupNameRegex = '(#.*?):+(.*?):+(.*?)\.(.*?):+(.*?)#$'
|
||||||
|
# g1 g2 g3 g4 g5
|
||||||
|
|
||||||
|
# determine number of paths to loop
|
||||||
|
#fuseRootDirs = os.walk(testRoot)
|
||||||
|
fuseRootDirs = os.walk(fuseRoot)
|
||||||
|
|
||||||
# walk through the fuse-mounted OSD
|
# walk through the fuse-mounted OSD
|
||||||
for fullPaths, dirNames, fileNames in os.walk(fuseRoot):
|
#for fullPaths in fuseRootDirs:
|
||||||
|
for fullPaths, _, _ in fuseRootDirs:
|
||||||
# dont walk into excluded dirs
|
# dont walk into excluded dirs
|
||||||
if not exclusionDirs in fullPaths:
|
if not exclusionDirs in fullPaths:
|
||||||
# only walk into dirs which are attr dir
|
# only walk into dirs which are attr dir
|
||||||
if fullPaths.endswith(relativeFolderStructureDir):
|
if fullPaths.endswith(relativeFolderStructureDir):
|
||||||
# at this point we've got the dirs we want
|
# at this point we've got the dirs we want
|
||||||
|
|
||||||
# now we can walk up 1 dir for the (assumed) placement group's main dir
|
# now we can walk up 1 dir for the (assumed) placement group's main dir
|
||||||
pgMainDir = os.path.normpath(os.path.dirname(fullPaths))
|
pgMainDir = os.path.normpath(os.path.dirname(fullPaths))
|
||||||
|
|
||||||
# join up the main dir with the folder structure file
|
# join up the main dir with the folder structure file
|
||||||
pgFolderStructureFile = os.path.normpath(os.path.join(pgMainDir, relativeFolderStructureDir[1:], folderStructureFile))
|
pgFolderStructureFile = os.path.normpath(os.path.join(pgMainDir, relativeFolderStructureDir[1:], folderStructureFile))
|
||||||
|
|
||||||
# only proceed if the folder structure file exists
|
# extract the file's unique reference
|
||||||
if os.path.exists(pgFolderStructureFile):
|
pgDataFileRegexSearch = re.search(placementGroupNameRegex, Path(pgFolderStructureFile).parent.parent.name)
|
||||||
# join up the main dir with the data file
|
|
||||||
pgDataFile = os.path.normpath(os.path.join(pgMainDir, dataFilename))
|
# only proceed if a match has been made. this means that there _is_ a data file to grab
|
||||||
|
if pgDataFileRegexSearch:
|
||||||
|
pgDataFileUniqueIndicator = pgDataFileRegexSearch.group(3)
|
||||||
|
pgDataFileChunk = int(pgDataFileRegexSearch.group(4), 16)
|
||||||
|
|
||||||
# only proceed if the data file exists
|
# only proceed if the folder structure file exists
|
||||||
if os.path.exists(pgDataFile):
|
if os.path.exists(pgFolderStructureFile):
|
||||||
# ####################################################################################################
|
# join up the main dir with the data file
|
||||||
# running the loop to scrape the file/folder structure
|
pgDataFile = os.path.normpath(os.path.join(pgMainDir, dataFilename))
|
||||||
|
|
||||||
# empty list for saving the path details
|
# only proceed if the data file exists
|
||||||
filePathInformation = []
|
if os.path.exists(pgDataFile):
|
||||||
|
# ####################################################################################################
|
||||||
# open the file readonly as a binary file
|
# running the loop to scrape the file/folder structure
|
||||||
with open(pgFolderStructureFile, mode='rb') as file:
|
|
||||||
# get EOF seek address - seek to 0 bytes from the end of file (2)
|
|
||||||
file.seek(0, 2)
|
|
||||||
eofAddress = file.tell()
|
|
||||||
|
|
||||||
# at the start of the file, we will load up the first address
|
# empty list for saving the path details
|
||||||
file.seek(startOfFirstFilenameDefinitionAddress, 0)
|
filePathInformation = []
|
||||||
|
|
||||||
# seek through the binary file until our seek cursor is at the end of the file
|
# open the file readonly as a binary file
|
||||||
while(file.tell() < eofAddress):
|
with open(pgFolderStructureFile, mode='rb') as file:
|
||||||
file.read(filenameDefinitionLength).hex(' ')
|
# get EOF seek address - seek to 0 bytes from the end of file (2)
|
||||||
#print(file.read(filenameDefinitionLength).hex(' '))
|
file.seek(0, 2)
|
||||||
#todo: ensure that this value is 0x02 0x02
|
eofAddress = file.tell()
|
||||||
|
|
||||||
#todo: not sure if this is the permission value
|
# return to beginning of file - seek to 0 bytes from the beginning of file (0)
|
||||||
file.seek(relativeFilePermissionsAddress, 1)
|
file.seek(0, 0)
|
||||||
file.read(filePermissionsLength).hex(' ')
|
|
||||||
#print(file.read(filePermissionsLength).hex(' '))
|
|
||||||
|
|
||||||
#todo: not sure if this is the date value
|
# _parent file header
|
||||||
file.seek(relativeFileDateAddress, 1)
|
startDefinition = file.read(filenameDefinitionLength)
|
||||||
file.read(fileDateLength).hex(' ')
|
#print(startDefinition.hex(' '))
|
||||||
#print(file.read(fileDateLength).hex(' '))
|
#todo: ensure that this value is 0x05 0x04
|
||||||
|
|
||||||
#todo: this _appears_ to always be a true byte (ie. 0x01). perhaps it is an alignment byte?
|
# unknown block 1
|
||||||
file.seek(relativeTrueAddress, 1)
|
unknownDefinition1 = file.read(unknownDefinition1Length)
|
||||||
file.read(trueLength).hex(' ')
|
#print(unknownDefinition1.hex(' '))
|
||||||
#print(file.read(trueLength).hex(' '))
|
|
||||||
#todo: ensure that this value is 0x01
|
|
||||||
|
|
||||||
file.seek(relativeFileNameLengthAddress, 1)
|
# null with length 2
|
||||||
fileNameLength = file.read(fileNameLengthLength)
|
file.read(null2Length)
|
||||||
#print(fileNameLength.hex(' '))
|
|
||||||
|
|
||||||
file.seek(relativeFileNameAddress, 1)
|
# unknown block 2
|
||||||
fileName = file.read(int.from_bytes(fileNameLength)).decode('utf-8')
|
unknownDefinition2 = file.read(unknownDefinition2Length)
|
||||||
#print(fileName)
|
#print(unknownDefinition2.hex(' '))
|
||||||
|
|
||||||
# append the file name that we have captured to a list
|
# null with length 1
|
||||||
if fileName:
|
file.read(null1Length)
|
||||||
filePathInformation.append(fileName)
|
|
||||||
|
|
||||||
# move to the next file/dir name definition
|
# continue reading byte
|
||||||
file.seek(relativeNextFileDefinitionAddress, 1)
|
continueReading = file.read(continueReadingLength) != byteCompareForBool
|
||||||
|
#print(continueReading)
|
||||||
# ####################################################################################################
|
#todo: it should always be 0x01 here as we are in the header
|
||||||
# handling the filename with the data file
|
|
||||||
|
|
||||||
# add root dirs
|
|
||||||
filePathInformation.append(destRoot)
|
|
||||||
filePathInformation.append(mntDir)
|
|
||||||
|
|
||||||
# first reverse the list so that it is easier to create dir structure
|
|
||||||
filePathInformation.reverse()
|
|
||||||
|
|
||||||
# joins up all the dirs with the destination root dir. excludes filename
|
|
||||||
newDir = os.path.normpath(os.path.join(*filePathInformation[:-1]))
|
|
||||||
#print(newDir)
|
|
||||||
|
|
||||||
newFile = os.path.normpath(os.path.join(*filePathInformation))
|
|
||||||
#print(newFile)
|
|
||||||
|
|
||||||
# ####################################################################################################
|
|
||||||
# FILE RECOVERY
|
|
||||||
|
|
||||||
#BUG: a data file can be zero bytes. however i found that in the OSD, a zero byte data file can also be an empty folder
|
|
||||||
# in this case, ill skip them. by inverting the condition, i can process them if i find that im missing files
|
|
||||||
|
|
||||||
# skip data files that are 0 bytes
|
|
||||||
if not os.stat(pgDataFile).st_size == 0:
|
|
||||||
# make that dir
|
|
||||||
if not os.path.exists(newDir):
|
|
||||||
print('new dir: ' + newDir)
|
|
||||||
os.makedirs(newDir)
|
|
||||||
|
|
||||||
# copy the data file to the fullpath file
|
# null with length 2
|
||||||
if not os.path.exists(newFile):
|
file.read(null2Length)
|
||||||
print('old file: ' + pgDataFile)
|
|
||||||
print('new file: ' + newFile)
|
# number of directory names in path
|
||||||
shutil.copyfile(pgDataFile, newFile)
|
numberOfDirectoryNames = file.read(numberOfDirectoryNamesLength)
|
||||||
|
#print(numberOfDirectoryNames.hex(' '))
|
||||||
|
|
||||||
|
# null with length 3
|
||||||
|
file.read(null3Length)
|
||||||
|
|
||||||
|
# seek through the binary file until our seek cursor is at the end of the file
|
||||||
|
#while(file.tell() < eofAddress):
|
||||||
|
while(file.tell() < eofAddress and continueReading):
|
||||||
|
# file header
|
||||||
|
startFileDefinition = file.read(filenameDefinitionLength)
|
||||||
|
#print(startFileDefinition.hex(' '))
|
||||||
|
#todo: ensure that this value is 0x02 0x02
|
||||||
|
|
||||||
|
# unknown block 3
|
||||||
|
unknownDefinition3 = file.read(unknownDefinition3Length)
|
||||||
|
#print(unknownDefinition3.hex(' '))
|
||||||
|
|
||||||
|
# null with length 2
|
||||||
|
file.read(null2Length)
|
||||||
|
|
||||||
|
# unknown block 4
|
||||||
|
unknownDefinition4 = file.read(unknownDefinition4Length)
|
||||||
|
#print(unknownDefinition4.hex(' '))
|
||||||
|
|
||||||
|
# null with length 1
|
||||||
|
file.read(null1Length)
|
||||||
|
|
||||||
|
# continue reading byte
|
||||||
|
continueReading = file.read(continueReadingLength) != byteCompareForBool
|
||||||
|
#print(continueReading)
|
||||||
|
# this should stop the loop after reading the last file path
|
||||||
|
|
||||||
|
# null with length 2
|
||||||
|
file.read(null2Length)
|
||||||
|
|
||||||
|
# length of the filename
|
||||||
|
fileNameLength = file.read(fileNameLengthLength)
|
||||||
|
#print(fileNameLength.hex(' '))
|
||||||
|
|
||||||
|
# null with length 3
|
||||||
|
file.read(null3Length)
|
||||||
|
|
||||||
|
# this is the filename. read length is based on above length of filename
|
||||||
|
fileName = file.read(int.from_bytes(fileNameLength)).decode('utf-8')
|
||||||
|
#print(fileName)
|
||||||
|
|
||||||
|
# append the file name that we have captured to a list
|
||||||
|
if fileName:
|
||||||
|
filePathInformation.append(fileName)
|
||||||
|
|
||||||
|
# unknown block 5
|
||||||
|
unknownDefinition5 = file.read(unknownDefinition5Length)
|
||||||
|
#print(unknownDefinition5.hex(' '))
|
||||||
|
|
||||||
|
# unknown block 6
|
||||||
|
unknownDefinition6 = file.read(unknownDefinition6Length)
|
||||||
|
#print(unknownDefinition6.hex(' '))
|
||||||
|
|
||||||
|
# null with length 4
|
||||||
|
file.read(null4Length)
|
||||||
|
|
||||||
|
# after the loop, we have 1 final area to check and that is the item type
|
||||||
|
fileType = file.read(fileTypeLength)
|
||||||
|
#print(fileType.hex(' '))
|
||||||
|
|
||||||
|
# ####################################################################################################
|
||||||
|
# handling the filename with the data file
|
||||||
|
|
||||||
|
# add root dirs
|
||||||
|
filePathInformation.append(destRoot)
|
||||||
|
filePathInformation.append(mntDir)
|
||||||
|
|
||||||
|
# first reverse the list so that it is easier to create dir structure
|
||||||
|
filePathInformation.reverse()
|
||||||
|
|
||||||
|
# joins up all the dirs with the destination root dir. excludes filename
|
||||||
|
newDir = os.path.normpath(os.path.join(*filePathInformation[:-1]))
|
||||||
|
#print(newDir)
|
||||||
|
|
||||||
|
newFile = os.path.normpath(os.path.join(*filePathInformation))
|
||||||
|
#print(newFile)
|
||||||
|
|
||||||
|
# ####################################################################################################
|
||||||
|
# FILE RECOVERY
|
||||||
|
|
||||||
|
# if it is a directory, make that directory!
|
||||||
|
if fileType == fileTypeMappingDir:
|
||||||
|
if not os.path.exists(newFile):
|
||||||
|
print('new dir: ' + newFile)
|
||||||
|
os.makedirs(newFile)
|
||||||
|
|
||||||
|
# if it is a file, copy that file!
|
||||||
|
if fileType == fileTypeMappingFile:
|
||||||
|
if not os.path.exists(newFile):
|
||||||
|
#if os.path.exists(newFile):
|
||||||
|
if not os.path.exists(newDir):
|
||||||
|
print('new dir: ' + newDir)
|
||||||
|
os.makedirs(newDir)
|
||||||
|
|
||||||
|
print('old file: ' + pgDataFile)
|
||||||
|
print('new file: ' + newFile)
|
||||||
|
shutil.copyfile(pgDataFile, newFile)
|
||||||
|
|
||||||
|
#NEW BUG ALERT: files over 4096KB are TRUNCATED! likely means that the files were ripped into pieces
|
||||||
|
# RIP MY FILES INTO PIECES
|
||||||
|
# THIS IS MY LAST RESTORE
|
||||||
|
|
||||||
|
# if the file is 4MB, then we must also check if it has been chunked
|
||||||
|
if os.stat(pgDataFile).st_size == 4194304:
|
||||||
|
listOfPgDataFileChunkPaths = []
|
||||||
|
|
||||||
|
print('data file size is 4MB. checking if chunked...')
|
||||||
|
|
||||||
|
# search for the placement group's file identifier in other placement groups
|
||||||
|
#WARNING: this process is SLOW as i have to re-scan the whole dir
|
||||||
|
#for pgDataFileUniqueIndicatorFullPaths in fuseRootDirs:
|
||||||
|
for pgDataFileUniqueIndicatorFullPaths, _, _ in fuseRootDirs:
|
||||||
|
# only walk into dirs which contain the file's unique identifier
|
||||||
|
if pgDataFileUniqueIndicator in pgDataFileUniqueIndicatorFullPaths:
|
||||||
|
listOfPgDataFileChunkPaths.append(os.path.normpath(pgDataFileUniqueIndicatorFullPaths))
|
||||||
|
|
||||||
|
# count the number of directories found. logic:
|
||||||
|
|
||||||
|
# if the number of chunked files found is only 1, then it really _is_ a 4MB file
|
||||||
|
if len(listOfPgDataFileChunkPaths) == 1:
|
||||||
|
print('data file not chunked')
|
||||||
|
|
||||||
|
# if more than 1 dir is found, then append the data to the already-written file (above)
|
||||||
|
if len(listOfPgDataFileChunkPaths) > 1:
|
||||||
|
print('chunked data file found!')
|
||||||
|
|
||||||
|
# iterate over the list of file chunk paths
|
||||||
|
for pgDataFileChunkPath in listOfPgDataFileChunkPaths:
|
||||||
|
|
||||||
|
# increase the chunk iterator
|
||||||
|
pgDataFileChunk += 1
|
||||||
|
|
||||||
|
# convert it to a hex string (for searching the directory names)
|
||||||
|
pgDataFileChunkString = (f'{pgDataFileChunk:0>8x}')
|
||||||
|
|
||||||
|
# loop through the list (again) and filter by current chunk
|
||||||
|
for pgDataFileChunkPath2 in listOfPgDataFileChunkPaths:
|
||||||
|
if pgDataFileChunkString in pgDataFileChunkPath2:
|
||||||
|
|
||||||
|
# this gets the main dir of the data chunk file
|
||||||
|
pgDataChunkMainDir = os.path.normpath(os.path.dirname(pgDataFileChunkPath2))
|
||||||
|
|
||||||
|
# this gets the path of the data chunk file
|
||||||
|
pgDataChunkFile = os.path.normpath(os.path.join(pgDataChunkMainDir, dataFilename))
|
||||||
|
|
||||||
|
#print(pgDataChunkFile)
|
||||||
|
print(pgDataFileChunkPath2)
|
||||||
|
|
||||||
|
# read the file's chunk data file
|
||||||
|
with open(pgDataChunkFile, 'rb') as pgDataFileChunkDataFile_file:
|
||||||
|
# write to the file in append binary mode
|
||||||
|
with open(newFile, 'ab') as newFile_file:
|
||||||
|
#print('reading from: ' + pgDataChunkFile)
|
||||||
|
newFile_file.write(pgDataFileChunkDataFile_file.read())
|
||||||
|
|||||||
Reference in New Issue
Block a user