"""Rebuild a file tree from a FUSE-mounted OSD.

For every object directory under the mount that has both an ``attr/_parent``
file and a ``data`` file, the names stored in ``_parent`` are read and used
as the destination path for a copy of ``data``.

The ``_parent`` layout encoded in the alignment vars below was worked out by
inspection; field meanings marked "todo" are still unconfirmed.
"""

import os
import shutil
import sys

# ####################################################################################################
# some alignment vars
offsetRelativeAddressCorrection = 0x01

startOfFirstFilenameDefinitionAddress = 0x12
filenameDefinitionLength = 0x02
relativeFilePermissionsAddress = 0x01 - offsetRelativeAddressCorrection
filePermissionsLength = 0x02
relativeFileDateAddress = 0x03 - offsetRelativeAddressCorrection
fileDateLength = 0x04
relativeTrueAddress = 0x02 - offsetRelativeAddressCorrection
trueLength = 0x01
relativeFileNameLengthAddress = 0x03 - offsetRelativeAddressCorrection
fileNameLengthLength = 0x01
relativeFileNameAddress = 0x04 - offsetRelativeAddressCorrection
#fileNameLength = 0x01 # this is captured dynamically within the _parent file read loop
relativeNextFileDefinitionAddress = 0x09 - offsetRelativeAddressCorrection

# ####################################################################################################
# walk through the filesystem

# these are test paths
testFileAttrName = '/mnt/test/5.f_head/all/#5:f1988779:::10002413871.00000000:head#/attr/_parent'
testFileDataName = '/mnt/test/5.f_head/all/#5:f1988779:::10002413871.00000000:head#/data'

# fuse mounted osd path
fuseRoot = '/mnt/test'

# destination dirs
mntDir = '/mnt'
destRoot = 'ceph-fs-storage'

# exclusions. mostly to exclude the metadata dir
# (the trailing comma matters: without it this is a plain string, not a tuple)
exclusionDirs = ('/mnt/test/meta',)

# for selecting the folder and grabbing relative files
relativeFolderStructureDir = '/attr'
folderStructureFile = '_parent'
dataFilename = 'data'


def _isExcluded(path, excluded):
    """Return True when path is one of the excluded dirs or lies inside one."""
    # tolerate a bare string so a one-entry config without the trailing comma
    # still behaves as a single directory instead of a sequence of characters
    if isinstance(excluded, str):
        excluded = (excluded,)

    candidate = os.path.normpath(path)
    for entry in excluded:
        entry = os.path.normpath(entry)
        if candidate == entry or candidate.startswith(entry + os.sep):
            return True
    return False


def _parseParentNames(attrPath):
    """Return the names stored in a _parent file, in the order they appear.

    Args:
        attrPath: path of the binary _parent file.

    Returns:
        List of non-empty decoded names. Callers reverse this list to build a
        path, i.e. the file's own name is expected to come first.

    Raises:
        OSError: the file cannot be read.
        ValueError: a name is not valid UTF-8 (UnicodeDecodeError) or cannot
            be used as a single path component.
    """
    with open(attrPath, mode='rb') as file:
        contents = file.read()

    names = []

    # at the start of the file, we will load up the first address
    cursor = startOfFirstFilenameDefinitionAddress

    # step through the records until the cursor is at/after the end of the file.
    # slicing past the end yields b'', which mirrors reading a file at EOF.
    while cursor < len(contents):
        #todo: ensure that this value is 0x02 0x02
        cursor += filenameDefinitionLength

        #todo: not sure if this is the permission value
        cursor += relativeFilePermissionsAddress + filePermissionsLength

        #todo: not sure if this is the date value
        cursor += relativeFileDateAddress + fileDateLength

        #todo: this _appears_ to be always a true byte (ie. 0x01). perhaps it is an alignment byte?
        cursor += relativeTrueAddress + trueLength

        cursor += relativeFileNameLengthAddress
        rawLength = contents[cursor:cursor + fileNameLengthLength]
        cursor += fileNameLengthLength
        # a single byte is read, so byte order is irrelevant; it is passed
        # explicitly because int.from_bytes() requires it before Python 3.11
        nameLength = int.from_bytes(rawLength, 'big')

        cursor += relativeFileNameAddress
        fileName = contents[cursor:cursor + nameLength].decode('utf-8')
        cursor += nameLength

        # empty names (e.g. from trailing bytes after the last record) are ignored
        if fileName:
            # each name must stay a single path component, otherwise the join
            # in the caller could escape the destination directory
            if fileName in ('.', '..') or '/' in fileName or os.sep in fileName or '\0' in fileName:
                raise ValueError('unusable name %r in %s' % (fileName, attrPath))
            names.append(fileName)

        # move to the next file/dir name definition
        cursor += relativeNextFileDefinitionAddress

    return names


def recoverFiles(sourceRoot=fuseRoot, destinationDir=None, excluded=exclusionDirs):
    """Copy every recoverable object's data file to its rebuilt path.

    Args:
        sourceRoot: root of the fuse-mounted OSD to walk.
        destinationDir: directory that receives the rebuilt tree; defaults to
            ``mntDir/destRoot``.
        excluded: directories (and everything below them) that are not walked.

    Returns:
        Number of files copied. Objects whose destination already exists are
        left untouched and not counted.

    Raises:
        OSError: creating a destination directory or copying a file failed.
    """
    if destinationDir is None:
        destinationDir = os.path.join(mntDir, destRoot)

    copied = 0

    # walk through the fuse-mounted OSD
    for currentDir, dirNames, _fileNames in os.walk(sourceRoot):
        # dont walk into excluded dirs: pruning dirNames in place stops os.walk
        # from descending into them at all
        dirNames[:] = [
            name for name in dirNames
            if not _isExcluded(os.path.join(currentDir, name), excluded)
        ]
        if _isExcluded(currentDir, excluded):
            continue

        # only look at dirs which are attr dirs
        if not currentDir.endswith(relativeFolderStructureDir):
            continue

        # walk up 1 dir for the (assumed) placement group's main dir
        pgMainDir = os.path.normpath(os.path.dirname(currentDir))
        pgFolderStructureFile = os.path.normpath(
            os.path.join(pgMainDir, relativeFolderStructureDir[1:], folderStructureFile))
        pgDataFile = os.path.normpath(os.path.join(pgMainDir, dataFilename))

        # only proceed if both the folder structure file and the data file exist
        if not (os.path.isfile(pgFolderStructureFile) and os.path.isfile(pgDataFile)):
            continue

        # one unreadable/malformed _parent must not abort the whole recovery run
        try:
            names = _parseParentNames(pgFolderStructureFile)
        except (OSError, ValueError) as error:
            print('skipping %s: %s' % (pgMainDir, error), file=sys.stderr)
            continue

        # without any name the destination would be destinationDir itself, and
        # copying a file there would block every later makedirs below it
        if not names:
            print('skipping %s: no names found in %s' % (pgMainDir, pgFolderStructureFile),
                  file=sys.stderr)
            continue

        # names are reversed so the path reads root-first, ending in the filename
        newFile = os.path.normpath(os.path.join(destinationDir, *reversed(names)))
        newDir = os.path.dirname(newFile)

        # ################################################################################################
        # FILE RECOVERY

        # make that dir
        os.makedirs(newDir, exist_ok=True)

        # copy the data file to the fullpath file, never overwriting an existing one
        if not os.path.exists(newFile):
            shutil.copyfile(pgDataFile, newFile)
            copied += 1

    return copied


def main():
    """Run the recovery with the paths configured at the top of this file."""
    recoverFiles()


if __name__ == '__main__':
    main()