-
Notifications
You must be signed in to change notification settings - Fork 419
[CELEBORN-2270] Fix problem with eviction to tiered storage during partition split #3610
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
|
|
@@ -49,7 +49,7 @@ class StoragePolicy(conf: CelebornConf, storageManager: StorageManager, source: | |||
| true) | ||||
| } | ||||
| } | ||||
| logError(s"Create evict file failed for ${partitionDataWriterContext.getPartitionLocation}") | ||||
| logError(s"Create evict file failed for ${partitionDataWriterContext.getPartitionLocation} - no policy for ${celebornFile.storageType.name()}") | ||||
| null | ||||
| } | ||||
|
|
||||
|
|
@@ -94,6 +94,7 @@ class StoragePolicy(conf: CelebornConf, storageManager: StorageManager, source: | |||
| } | ||||
|
|
||||
| def tryCreateFileByType(storageInfoType: StorageInfo.Type): TierWriterBase = { | ||||
| val overrideType = if (evict) storageInfoType else location.getStorageInfo.getType | ||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In a scenario where eviction does not occur, should overrideType also be storageInfoType?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I did it this way in order to not change the behavior of the code on the this is code is really tricky, because it handles two things at once:
in theory the value for location.getStorageInfo.getType should be the same as storageInfoType when evict=false, but not for MEMORY code is here: celeborn/worker/src/main/scala/org/apache/celeborn/service/deploy/worker/storage/StoragePolicy.scala Line 198 in 11b054a
I am not still super familiar with this code (even if I debugged it many many times 🙄 ) and I did not want to break it
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Suppose location.getStorageInfo.getType is set to HDD and createFileOrder is [HDD, S3], if the local HDD fails to create a file, storageInfoType will switch to S3. See following code segment: However, when storageInfoType is set to S3, the method createDiskFile still uses location.getStorageInfo.getType as HDD, which seems incorrect. Therefore, StorageType seems should always be overrideType. |
||||
| try { | ||||
| storageInfoType match { | ||||
| case StorageInfo.Type.MEMORY => | ||||
|
|
@@ -118,19 +119,24 @@ class StoragePolicy(conf: CelebornConf, storageManager: StorageManager, source: | |||
| partitionDataWriterContext, | ||||
| storageManager) | ||||
| } else { | ||||
| logWarning( | ||||
| s"Not creating ${storageInfoType} file from ${location.getStorageInfo.getType} for ${partitionDataWriterContext.getShuffleKey} ${partitionDataWriterContext.getPartitionLocation.getFileName}") | ||||
| null | ||||
| } | ||||
| case StorageInfo.Type.HDD | StorageInfo.Type.SSD | StorageInfo.Type.HDFS | StorageInfo.Type.OSS | StorageInfo.Type.S3 => | ||||
| if (storageManager.localOrDfsStorageAvailable) { | ||||
| logDebug(s"create non-memory file for ${partitionDataWriterContext.getShuffleKey} ${partitionDataWriterContext.getPartitionLocation.getFileName}") | ||||
| logDebug( | ||||
| s"create non-memory file type $storageInfoType (evict=$evict, override=$overrideType) for ${partitionDataWriterContext.getShuffleKey} ${partitionDataWriterContext.getPartitionLocation.getFileName}") | ||||
| val (flusher, diskFileInfo, workingDir) = storageManager.createDiskFile( | ||||
| location, | ||||
| partitionDataWriterContext.getAppId, | ||||
| partitionDataWriterContext.getShuffleId, | ||||
| location.getFileName, | ||||
| partitionDataWriterContext.getUserIdentifier, | ||||
| partitionDataWriterContext.getPartitionType, | ||||
| partitionDataWriterContext.isPartitionSplitEnabled) | ||||
| partitionDataWriterContext.isPartitionSplitEnabled, | ||||
| overrideType // this is different from location type, in case of eviction | ||||
| ) | ||||
| partitionDataWriterContext.setWorkingDir(workingDir) | ||||
| val metaHandler = getPartitionMetaHandler(diskFileInfo) | ||||
| if (flusher.isInstanceOf[LocalFlusher] | ||||
|
|
@@ -167,7 +173,9 @@ class StoragePolicy(conf: CelebornConf, storageManager: StorageManager, source: | |||
| } | ||||
| } catch { | ||||
| case e: Exception => | ||||
| logError(s"create celeborn file for storage $storageInfoType failed", e) | ||||
| logError( | ||||
| s"create celeborn file for storage $storageInfoType failed for ${partitionDataWriterContext.getShuffleKey} ${partitionDataWriterContext.getPartitionLocation.getFileName}", | ||||
| e) | ||||
| null | ||||
| } | ||||
| } | ||||
|
|
@@ -193,7 +201,8 @@ class StoragePolicy(conf: CelebornConf, storageManager: StorageManager, source: | |||
| } | ||||
|
|
||||
| val maxSize = order.get.length | ||||
| for (i <- tryCreateFileTypeIndex until maxSize) { | ||||
| val firstIndex = tryCreateFileTypeIndex | ||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why change this? The change is same as the current implementation.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. because the value of "firstIndex" is logged below |
||||
| for (i <- firstIndex until maxSize) { | ||||
| val storageStr = order.get(i) | ||||
| val storageInfoType = StorageInfo.fromStrToType(storageStr) | ||||
| val file = tryCreateFileByType(storageInfoType) | ||||
|
|
@@ -203,9 +212,9 @@ class StoragePolicy(conf: CelebornConf, storageManager: StorageManager, source: | |||
| } | ||||
|
|
||||
| logError( | ||||
| s"Could not create file for storage type ${location.getStorageInfo.getType}") | ||||
| s"Could not create file for storage type ${location.getStorageInfo.getType}, tried ${order.get} firstIndex $firstIndex for ${partitionDataWriterContext.getShuffleKey} ${partitionDataWriterContext.getPartitionLocation.getFileName}") | ||||
|
|
||||
| throw new CelebornIOException( | ||||
| s"Create file failed for context ${partitionDataWriterContext.toString}") | ||||
| s"Create file failed for context ${partitionDataWriterContext.toString}, tried ${order.get} firstIndex $firstIndex") | ||||
| } | ||||
| } | ||||
Uh oh!
There was an error while loading. Please reload this page.