From 719c8bd3208ff0508bd17db6223ed48f5fc787d5 Mon Sep 17 00:00:00 2001 From: chadhardin Date: Wed, 14 Sep 2022 14:46:09 -0400 Subject: [PATCH 01/17] added new viz api --- flaml/automl.py | 9 +++++++++ flaml/searcher/blendsearch.py | 4 ++-- test.py | 5 +++++ 3 files changed, 16 insertions(+), 2 deletions(-) create mode 100644 test.py diff --git a/flaml/automl.py b/flaml/automl.py index 78ea15c4f5..f813482a78 100644 --- a/flaml/automl.py +++ b/flaml/automl.py @@ -2930,6 +2930,15 @@ def is_to_reverse_metric(metric, task): del self._state.groups, self._state.groups_all, self._state.groups_val logger.setLevel(old_level) + def viz(self, wordage, value): + print(wordage) + if value == 0: + print("This shows that the new API for visualizations works correctly") + else: + print("Just a basic if else statement") + + + def _search_parallel(self): try: from ray import __version__ as ray_version diff --git a/flaml/searcher/blendsearch.py b/flaml/searcher/blendsearch.py index 54db6e37f0..a0ce357ed0 100644 --- a/flaml/searcher/blendsearch.py +++ b/flaml/searcher/blendsearch.py @@ -11,8 +11,8 @@ from ray import __version__ as ray_version assert ray_version >= "1.10.0" - from ray.tune.suggest import Searcher - from ray.tune.suggest.optuna import OptunaSearch as GlobalSearch + from ray.tune.search import Searcher + from ray.tune.search.optuna import OptunaSearch as GlobalSearch except (ImportError, AssertionError): from .suggestion import Searcher from .suggestion import OptunaSearch as GlobalSearch diff --git a/test.py b/test.py new file mode 100644 index 0000000000..497d7d87a3 --- /dev/null +++ b/test.py @@ -0,0 +1,5 @@ + +Wizzy = "This is a great visualization" +from flaml import AutoML +automl = AutoML() +automl.viz(Wizzy, 0) From 8f130b86fb6cdd065f7a32f9f9cccb4051e9e406 Mon Sep 17 00:00:00 2001 From: chadhardin Date: Wed, 28 Sep 2022 14:18:16 -0400 Subject: [PATCH 02/17] if statement added for different plots --- flaml/automl.py | 29 +++++++++++++++++++++++------ test.py | 46 
+++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 66 insertions(+), 9 deletions(-) diff --git a/flaml/automl.py b/flaml/automl.py index 9ad5862abf..a1aab8a216 100644 --- a/flaml/automl.py +++ b/flaml/automl.py @@ -20,6 +20,7 @@ from sklearn.utils import shuffle from sklearn.base import BaseEstimator import pandas as pd +import matplotlib.pyplot as plt import logging import json from .ml import ( @@ -2930,12 +2931,28 @@ def is_to_reverse_metric(metric, task): del self._state.groups, self._state.groups_all, self._state.groups_val logger.setLevel(old_level) - def viz(self, wordage, value): - print(wordage) - if value == 0: - print("This shows that the new API for visualizations works correctly") - else: - print("Just a basic if else statement") + def viz(self, + title = None, + xlab = None, + ylab = None, + plottype = None, + time_history = None, + valid_loss_history = None, + best_valid_loss_history = None, + ): + if plottype == "scatter": + plt.title(title) + plt.xlabel(xlab) + plt.ylabel(ylab) + plt.scatter(time_history, 1 - np.array(valid_loss_history)) + plt.step(time_history, 1 - np.array(best_valid_loss_history), where='post') + plt.show() + elif plottype == "feature": + print("working 1") + elif plottype == "Model": + print("best model b") + elif plottype == "parameters": + print("dees the best") diff --git a/test.py b/test.py index 497d7d87a3..e0080e5adf 100644 --- a/test.py +++ b/test.py @@ -1,5 +1,45 @@ - -Wizzy = "This is a great visualization" from flaml import AutoML +from flaml.data import load_openml_dataset +from flaml.data import get_output_from_log +import matplotlib.pyplot as plt + +X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=1169, data_dir='./') +print("Data type:", type(X_train), type(y_train)) +print("The first 5 rows of X_train:") +print(X_train.head()) +print("The first 5 rows of y_train:") +print(y_train.head()) + automl = AutoML() -automl.viz(Wizzy, 0) + +settings = { + "time_budget": 60, # total running 
time in seconds + "metric": 'accuracy', # can be: 'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'roc_auc_ovr', + # 'roc_auc_ovo', 'log_loss', 'mape', 'f1', 'ap', 'ndcg', 'micro_f1', 'macro_f1' + "task": 'classification', # task type + "log_file_name": 'airlines_experiment.log', # flaml log file + "seed": 7654321, # random seed +} + +automl.fit(X_train=X_train, y_train=y_train, **settings) + + +time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \ + get_output_from_log(filename=settings['log_file_name'], time_budget=240) +for config in config_history: + print(config) + + + +# ATMSeer: increasing transparency and controllability in automated ml page 4 +# Whither Automl? Understanding the role of automation in machine learning +# Which parameter is more important, which models are working the best dataset + +t = "Learning Curve" +xl = "Wall Clock Time (s)" +yl = "Validation Accuracy" +pt = "scatter" +bvlh = best_valid_loss_history +vlh = valid_loss_history +th = time_history +automl.viz(title = t, xlab = xl, ylab = yl, plottype = pt, time_history = th, valid_loss_history = vlh, best_valid_loss_history = bvlh) \ No newline at end of file From 4ca305e8dac4f712f451b6dfa8c6156b6956e502 Mon Sep 17 00:00:00 2001 From: chadhardin Date: Mon, 10 Oct 2022 14:51:16 -0400 Subject: [PATCH 03/17] Small Updates --- flaml/automl.py | 6 +++++- test.py | 13 +++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/flaml/automl.py b/flaml/automl.py index a1aab8a216..ed5f3fbb84 100644 --- a/flaml/automl.py +++ b/flaml/automl.py @@ -2948,10 +2948,14 @@ def viz(self, plt.step(time_history, 1 - np.array(best_valid_loss_history), where='post') plt.show() elif plottype == "feature": - print("working 1") + plt.title(title) + plt.barh(self.feature_names_in_, self.feature_importances_) + plt.show() elif plottype == "Model": + # pie graph that shows percentage of each model with the two best print("best model b") elif plottype == 
"parameters": + # ANOVA print("dees the best") diff --git a/test.py b/test.py index e0080e5adf..b9e5bb108a 100644 --- a/test.py +++ b/test.py @@ -2,6 +2,7 @@ from flaml.data import load_openml_dataset from flaml.data import get_output_from_log import matplotlib.pyplot as plt +import statsmodels.api as sm X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=1169, data_dir='./') print("Data type:", type(X_train), type(y_train)) @@ -28,18 +29,22 @@ get_output_from_log(filename=settings['log_file_name'], time_budget=240) for config in config_history: print(config) - - +print(metric_history) # ATMSeer: increasing transparency and controllability in automated ml page 4 # Whither Automl? Understanding the role of automation in machine learning # Which parameter is more important, which models are working the best dataset +# XAutoML A visual analytics tools for establishing trust + t = "Learning Curve" xl = "Wall Clock Time (s)" yl = "Validation Accuracy" -pt = "scatter" +pt = "feature" bvlh = best_valid_loss_history vlh = valid_loss_history th = time_history -automl.viz(title = t, xlab = xl, ylab = yl, plottype = pt, time_history = th, valid_loss_history = vlh, best_valid_loss_history = bvlh) \ No newline at end of file +automl.viz(t, xl, yl, pt) + +aov_table = sm.stats.anova_lm(automl, typ=2) +print(aov_table) \ No newline at end of file From 1d9cd4513d561c19ff28ee56896da5ed9bd32570 Mon Sep 17 00:00:00 2001 From: chadhardin Date: Mon, 17 Oct 2022 13:09:10 -0400 Subject: [PATCH 04/17] Visualization created with important features --- flaml/automl.py | 20 ++++---------------- test.py | 50 ------------------------------------------------- 2 files changed, 4 insertions(+), 66 deletions(-) delete mode 100644 test.py diff --git a/flaml/automl.py b/flaml/automl.py index 7163da9562..c39dd397e8 100644 --- a/flaml/automl.py +++ b/flaml/automl.py @@ -2931,32 +2931,20 @@ def is_to_reverse_metric(metric, task): del self._state.groups, self._state.groups_all, 
self._state.groups_val logger.setLevel(old_level) - def viz(self, + def vizualization(self, title = None, xlab = None, ylab = None, - plottype = None, + type = None, time_history = None, valid_loss_history = None, best_valid_loss_history = None, ): - if plottype == "scatter": - plt.title(title) - plt.xlabel(xlab) - plt.ylabel(ylab) - plt.scatter(time_history, 1 - np.array(valid_loss_history)) - plt.step(time_history, 1 - np.array(best_valid_loss_history), where='post') - plt.show() - elif plottype == "feature": + if type == "feature": plt.title(title) plt.barh(self.feature_names_in_, self.feature_importances_) plt.show() - elif plottype == "Model": - # pie graph that shows percentage of each model with the two best - print("best model b") - elif plottype == "parameters": - # ANOVA - print("dees the best") + diff --git a/test.py b/test.py deleted file mode 100644 index b9e5bb108a..0000000000 --- a/test.py +++ /dev/null @@ -1,50 +0,0 @@ -from flaml import AutoML -from flaml.data import load_openml_dataset -from flaml.data import get_output_from_log -import matplotlib.pyplot as plt -import statsmodels.api as sm - -X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=1169, data_dir='./') -print("Data type:", type(X_train), type(y_train)) -print("The first 5 rows of X_train:") -print(X_train.head()) -print("The first 5 rows of y_train:") -print(y_train.head()) - -automl = AutoML() - -settings = { - "time_budget": 60, # total running time in seconds - "metric": 'accuracy', # can be: 'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'roc_auc_ovr', - # 'roc_auc_ovo', 'log_loss', 'mape', 'f1', 'ap', 'ndcg', 'micro_f1', 'macro_f1' - "task": 'classification', # task type - "log_file_name": 'airlines_experiment.log', # flaml log file - "seed": 7654321, # random seed -} - -automl.fit(X_train=X_train, y_train=y_train, **settings) - - -time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \ - 
get_output_from_log(filename=settings['log_file_name'], time_budget=240) -for config in config_history: - print(config) -print(metric_history) - -# ATMSeer: increasing transparency and controllability in automated ml page 4 -# Whither Automl? Understanding the role of automation in machine learning -# Which parameter is more important, which models are working the best dataset -# XAutoML A visual analytics tools for establishing trust - - -t = "Learning Curve" -xl = "Wall Clock Time (s)" -yl = "Validation Accuracy" -pt = "feature" -bvlh = best_valid_loss_history -vlh = valid_loss_history -th = time_history -automl.viz(t, xl, yl, pt) - -aov_table = sm.stats.anova_lm(automl, typ=2) -print(aov_table) \ No newline at end of file From a9d5f57a5241eece64bcb6cfe174613b348726f8 Mon Sep 17 00:00:00 2001 From: chadhardin Date: Mon, 17 Oct 2022 13:35:08 -0400 Subject: [PATCH 05/17] Renamed type to feature_importance to be specific --- flaml/automl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flaml/automl.py b/flaml/automl.py index 55ecd6569e..210cb71fc5 100644 --- a/flaml/automl.py +++ b/flaml/automl.py @@ -2938,7 +2938,7 @@ def vizualization(self, valid_loss_history = None, best_valid_loss_history = None, ): - if type == "feature": + if type == "feature_importance": plt.title(title) plt.barh(self.feature_names_in_, self.feature_importances_) plt.show() From cb9ed36c1f4dac84d78b3db650e79b16026285da Mon Sep 17 00:00:00 2001 From: chadhardin Date: Mon, 17 Oct 2022 13:39:06 -0400 Subject: [PATCH 06/17] Adding minor comments --- flaml/automl.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/flaml/automl.py b/flaml/automl.py index 210cb71fc5..bbbb790bc7 100644 --- a/flaml/automl.py +++ b/flaml/automl.py @@ -2929,7 +2929,9 @@ def is_to_reverse_metric(metric, task): del self._state.groups, self._state.groups_all, self._state.groups_val logger.setLevel(old_level) - def vizualization(self, + + # A visualization API for FLAML to improve 
explainability of the automation + def visualization(self, title = None, xlab = None, ylab = None, @@ -2938,6 +2940,7 @@ def vizualization(self, valid_loss_history = None, best_valid_loss_history = None, ): + # Showing the feature importance of the data that was trained on if type == "feature_importance": plt.title(title) plt.barh(self.feature_names_in_, self.feature_importances_) From 5fff7b913a8e56dc20b6ee63315edfa790ce42ba Mon Sep 17 00:00:00 2001 From: chadhardin Date: Mon, 17 Oct 2022 13:55:26 -0400 Subject: [PATCH 07/17] Included an example in visualization --- flaml/automl.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/flaml/automl.py b/flaml/automl.py index bbbb790bc7..75c032b51b 100644 --- a/flaml/automl.py +++ b/flaml/automl.py @@ -2945,6 +2945,13 @@ def visualization(self, plt.title(title) plt.barh(self.feature_names_in_, self.feature_importances_) plt.show() + ''' + Example: + automl = AutoML() + automl.fit(***settings and data) + automl.visualization(title = "", type = "feature_importance") + It will then display the graph + ''' From 5e868cf7c783f8ad38f98bc1921ea42f3a8e40d0 Mon Sep 17 00:00:00 2001 From: chadhardin Date: Mon, 31 Oct 2022 12:57:31 -0400 Subject: [PATCH 08/17] Adding the logger warning and save fig --- flaml/automl.py | 11 +++++++++-- setup.py | 2 ++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/flaml/automl.py b/flaml/automl.py index 75c032b51b..76dccefc17 100644 --- a/flaml/automl.py +++ b/flaml/automl.py @@ -20,7 +20,6 @@ from sklearn.utils import shuffle from sklearn.base import BaseEstimator import pandas as pd -import matplotlib.pyplot as plt import logging import json from .ml import ( @@ -2940,11 +2939,19 @@ def visualization(self, valid_loss_history = None, best_valid_loss_history = None, ): + try: + import matplotlib as plt + except ImportError: + matplotlib = None + logger.warning( + "In order to the use the visualization functionality of FLAML use pip install matplotlib." 
+ ) # Showing the feature importance of the data that was trained on if type == "feature_importance": + plotfilename = input("Enter a filename to save the feature importance figure:\n") plt.title(title) plt.barh(self.feature_names_in_, self.feature_importances_) - plt.show() + plt.savefig("{plotfilename}.png") ''' Example: automl = AutoML() diff --git a/setup.py b/setup.py index 39fa7dc6c8..f7488ecde0 100644 --- a/setup.py +++ b/setup.py @@ -65,6 +65,7 @@ "hcrystalball==0.1.10", "seqeval", "pytorch-forecasting>=0.9.0", + "matplotlib" ], "catboost": ["catboost>=0.26"], "blendsearch": ["optuna==2.8.0"], @@ -100,6 +101,7 @@ "hcrystalball==0.1.10", "pytorch-forecasting>=0.9.0", ], + "visualization": ["matplotlib"] "benchmark": ["catboost>=0.26", "psutil==5.8.0", "xgboost==1.3.3"], }, classifiers=[ From 78f08f726f6a61aa2e9006085f88f87995a94006 Mon Sep 17 00:00:00 2001 From: chadhardin Date: Mon, 31 Oct 2022 13:45:59 -0400 Subject: [PATCH 09/17] Forgot a comma --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index db5cbd554a..0d3225959e 100644 --- a/setup.py +++ b/setup.py @@ -104,7 +104,7 @@ "hcrystalball==0.1.10", "pytorch-forecasting>=0.9.0", ], - "visualization": ["matplotlib"] + "visualization": ["matplotlib"], "benchmark": ["catboost>=0.26", "psutil==5.8.0", "xgboost==1.3.3"], }, classifiers=[ From 0cfb2f7e07a6b6193e58f42057c733e13d7991b3 Mon Sep 17 00:00:00 2001 From: chadhardin Date: Mon, 7 Nov 2022 15:33:55 -0500 Subject: [PATCH 10/17] Feature and validation updates --- flaml/automl.py | 111 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 106 insertions(+), 5 deletions(-) diff --git a/flaml/automl.py b/flaml/automl.py index 76dccefc17..a9b6571a4d 100644 --- a/flaml/automl.py +++ b/flaml/automl.py @@ -2944,14 +2944,115 @@ def visualization(self, except ImportError: matplotlib = None logger.warning( - "In order to the use the visualization functionality of FLAML use pip install matplotlib." 
- ) + "In order to the use the visualization functionality of FLAML use pip install matplotlib." + ) + try: + import lime + except ImportError: + lime = None + logger.warning( + "In order to the use the visualization functionality of FLAML use pip install lime." + ) + try: + import shap + except ImportError: + shap = None + logger.warning( + "In order to the use the visualization functionality of FLAML use pip install shap." + ) # Showing the feature importance of the data that was trained on if type == "feature_importance": + feature_importance_type = int(input("Choose 1 for basic feature importance and choose 2 to use Lime for a move advanced calculation of feature importance. \n")) plotfilename = input("Enter a filename to save the feature importance figure:\n") - plt.title(title) - plt.barh(self.feature_names_in_, self.feature_importances_) - plt.savefig("{plotfilename}.png") + if feature_importance_type == 1: + plt.title(title) + plt.barh(self.feature_names_in_, self.feature_importances_) + plt.savefig("{plotfilename}.png") + elif feature_importance_type == 2: + ''' The code for calculating feature importance with Lime and Diagnose was provided by group 7 in DS 440''' + estimator = getattr(self, "_trained_estimator", None) + if estimator is None: + logger.warning( + "No estimator is trained. Please run fit with enough budget." 
+ ) + return None + + if explainer == "LIME": + pandas_xtrain = pd.DataFrame(self._state.X_train) + pandas_xval = pd.DataFrame(self._state.X_val) + + if problem_type == "classification": + explain = lime_tabular.LimeTabularExplainer( + training_data=self._state.X_train, + feature_names=pandas_xtrain.columns, + class_names= list(class_names.values()), + mode='classification', + **kwargs) + + print("True Label: {}\n".format(class_names[self._state.y_train[row_index]])) + + exp = explain.explain_instance(data_row=pandas_xtrain.values[row_index], predict_fn = estimator.predict_proba, **kwargs) + exp.save_to_file('lime_classification.html') + return exp.show_in_notebook(show_table=True) + + if problem_type == "regression": + explain = lime_tabular.LimeTabularExplainer( + training_data= np.array(self._state.X_train), + feature_names=pandas_xtrain.columns, + mode='regression', + **kwargs) + + print(self._state.X_val) + print("True Label: {}\n".format(self._state.y_val[row_index])) + + exp = explain.explain_instance(data_row=self._state.X_val[row_index], predict_fn = estimator.predict, **kwargs) + exp.save_to_file('lime_regression.html') + return exp.show_in_notebook(show_table=True) + + + if problem_type == "text": + pass + + + if explainer == "SHAP": + if problem_type == "classification": + if model_type == "linear": + explainer = shap.LinearExplainer(estimator.predict_proba, self.train_data, feature_dependence="independent") + + elif model_type == "tree": + explainer = shap.TreeExplainer(estimator.predict_proba) + + else: + explainer = shap.KernelExplainer(estimator.predict_proba, self._state.X_train) + + shap_values = explainer.shap_values(self._state.X_val) + return shap.summary_plot(shap_values, self._state.X_val) + + if problem_type == "regression": + print(self._state.X_val) + if plot_type == "waterfall": + explainer = shap.Explainer(estimator.model) + shap_values = explainer(self._state.X_train) + return shap.plots.waterfall(shap_values[row_index]) + + if 
plot_type == "bar": + explainer = shap.Explainer(estimator.model) + shap_values = explainer(self._state.X_train) + return shap.plots.bar(shap_values) + + if plot_type == "beeswarm": + explainer = shap.Explainer(estimator.model) + shap_values = explainer(self._state.X_train) + return shap.plots.beeswarm(shap_values) + + if plot_type == "force": + shap.initjs() + explainer = shap.Explainer(estimator.model) + shap_values = explainer(self._state.X_train) + shap_plot = shap.plots.force(shap_values[row_index]) + shap.save_html("shap_regression.html", shap_plot) + return shap_plot + ''' Example: automl = AutoML() From f53aeeeeb69c6fbb2ff692db43e491cb54ade1aa Mon Sep 17 00:00:00 2001 From: chadhardin Date: Mon, 7 Nov 2022 15:35:15 -0500 Subject: [PATCH 11/17] test file added --- flaml/automl.py | 25 +++++++++++++------ setup.py | 2 +- test.py | 66 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 84 insertions(+), 9 deletions(-) create mode 100644 test.py diff --git a/flaml/automl.py b/flaml/automl.py index a9b6571a4d..b93dc782c1 100644 --- a/flaml/automl.py +++ b/flaml/automl.py @@ -2935,12 +2935,10 @@ def visualization(self, xlab = None, ylab = None, type = None, - time_history = None, - valid_loss_history = None, - best_valid_loss_history = None, + settings = None, ): try: - import matplotlib as plt + import matplotlib.pyplot as plt except ImportError: matplotlib = None logger.warning( @@ -2962,12 +2960,12 @@ def visualization(self, ) # Showing the feature importance of the data that was trained on if type == "feature_importance": - feature_importance_type = int(input("Choose 1 for basic feature importance and choose 2 to use Lime for a move advanced calculation of feature importance. \n")) + feature_importance_type = int(input("Enter 1 for the model's feature importance and enter 2 to use Lime for a different method of feature importance. 
\n")) plotfilename = input("Enter a filename to save the feature importance figure:\n") if feature_importance_type == 1: - plt.title(title) plt.barh(self.feature_names_in_, self.feature_importances_) - plt.savefig("{plotfilename}.png") + plt.savefig("{}.png".format(plotfilename)) + """ elif feature_importance_type == 2: ''' The code for calculating feature importance with Lime and Diagnose was provided by group 7 in DS 440''' estimator = getattr(self, "_trained_estimator", None) @@ -3060,7 +3058,18 @@ def visualization(self, automl.visualization(title = "", type = "feature_importance") It will then display the graph ''' - + """ + elif type == "validation_accuracy": + from flaml.data import get_output_from_log + plotfilename = input("Enter a filename to save the validation accuracy figure:\n") + time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \ + get_output_from_log(filename=settings['log_file_name'], time_budget=240) + plt.title(title) + plt.xlabel(xlab) + plt.ylabel(ylab) + plt.scatter(time_history, 1 - np.array(valid_loss_history)) + plt.step(time_history, 1 - np.array(best_valid_loss_history), where='post') + plt.savefig("{}".format(plotfilename)) diff --git a/setup.py b/setup.py index 0d3225959e..b584515e82 100644 --- a/setup.py +++ b/setup.py @@ -104,7 +104,7 @@ "hcrystalball==0.1.10", "pytorch-forecasting>=0.9.0", ], - "visualization": ["matplotlib"], + "visualization": ["matplotlib", "lime", "shap"], "benchmark": ["catboost>=0.26", "psutil==5.8.0", "xgboost==1.3.3"], }, classifiers=[ diff --git a/test.py b/test.py new file mode 100644 index 0000000000..f6beb4f87b --- /dev/null +++ b/test.py @@ -0,0 +1,66 @@ +from flaml import AutoML +from flaml.data import load_openml_dataset +from flaml.data import get_output_from_log +import matplotlib.pyplot as plt +import statsmodels.api as sm + +X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=1169, data_dir='./') +print("Data type:", type(X_train), 
type(y_train)) + +automl = AutoML() + +settings = { + "time_budget": 60, # total running time in seconds + "metric": 'accuracy', # can be: 'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'roc_auc_ovr', + # 'roc_auc_ovo', 'log_loss', 'mape', 'f1', 'ap', 'ndcg', 'micro_f1', 'macro_f1' + "task": 'classification', # task type + "log_file_name": 'airlines_experiment.log', # flaml log file + "seed": 7654321, # random seed +} + +automl.fit(X_train=X_train, y_train=y_train, **settings) + + +time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \ + get_output_from_log(filename=settings['log_file_name'], time_budget=240) +for config in config_history: + print(config) +print(metric_history) + +# ATMSeer: increasing transparency and controllability in automated ml page 4 +# Whither Automl? Understanding the role of automation in machine learning +# Which parameter is more important, which models are working the best dataset +# XAutoML A visual analytics tools for establishing trust + + +t = "Learning Curve" +xl = "Wall Clock Time (s)" +yl = "Validation Accuracy" +pt = "feature" +bvlh = best_valid_loss_history +vlh = valid_loss_history +th = time_history +automl.visualization(type = "feature_importance") +automl.visualization(type = "validation_accuracy", xlab = xl, ylab = yl, settings = settings) + + + +''' +if plottype == "scatter": + plt.title(title) + plt.xlabel(xlab) + plt.ylabel(ylab) + plt.scatter(time_history, 1 - np.array(valid_loss_history)) + plt.step(time_history, 1 - np.array(best_valid_loss_history), where='post') + plt.show() +elif plottype == "feature": + plt.title(title) + plt.barh(self.feature_names_in_, self.feature_importances_) + plt.show() + elif plottype == "Model": + # pie graph that shows percentage of each model with the two best + print("best model b") + elif plottype == "parameters": + # ANOVA + print("dees the best") +''' \ No newline at end of file From a94056885e56283408a0dbe2d7701897ba129871 Mon Sep 17 
00:00:00 2001 From: chadhardin Date: Mon, 7 Nov 2022 15:39:25 -0500 Subject: [PATCH 12/17] Updated testing file name --- flaml/automl.py | 2 ++ test.py => test/test_visualization.py | 0 2 files changed, 2 insertions(+) rename test.py => test/test_visualization.py (100%) diff --git a/flaml/automl.py b/flaml/automl.py index b93dc782c1..b9ba004930 100644 --- a/flaml/automl.py +++ b/flaml/automl.py @@ -3064,6 +3064,8 @@ def visualization(self, plotfilename = input("Enter a filename to save the validation accuracy figure:\n") time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \ get_output_from_log(filename=settings['log_file_name'], time_budget=240) + plt.clf() + plt.cla() plt.title(title) plt.xlabel(xlab) plt.ylabel(ylab) diff --git a/test.py b/test/test_visualization.py similarity index 100% rename from test.py rename to test/test_visualization.py From aca206bbfb24119411de52c5356b545dd9f119e4 Mon Sep 17 00:00:00 2001 From: Qingyun Wu Date: Tue, 6 Dec 2022 22:21:34 -0800 Subject: [PATCH 13/17] simplify api --- flaml/automl/automl.py | 180 +++++++----------------------- setup.py | 5 +- test/automl/test_visualization.py | 24 ++++ test/test_visualization.py | 66 ----------- 4 files changed, 67 insertions(+), 208 deletions(-) create mode 100644 test/automl/test_visualization.py delete mode 100644 test/test_visualization.py diff --git a/flaml/automl/automl.py b/flaml/automl/automl.py index baafa7f635..84a003d96c 100644 --- a/flaml/automl/automl.py +++ b/flaml/automl/automl.py @@ -2963,152 +2963,54 @@ def is_to_reverse_metric(metric, task): del self._state.groups, self._state.groups_all, self._state.groups_val logger.setLevel(old_level) - - # A visualization API for FLAML to improve explainability of the automation - def visualization(self, - title = None, - xlab = None, - ylab = None, - type = None, - settings = None, - ): + def visualize( + self, + type="learning_curve", + automl_instance=None, + plot_filename=None, + 
log_file_name=None, + **kwargs, + ): + """ + type: The type of the plot. The default visualization type is the learning curve. + automl_instance: An flaml AutoML instance. + plot_filename: str | File name + log_file_name: str | Log file name + """ try: import matplotlib.pyplot as plt except ImportError: - matplotlib = None - logger.warning( - "In order to the use the visualization functionality of FLAML use pip install matplotlib." - ) - try: - import lime - except ImportError: - lime = None - logger.warning( - "In order to the use the visualization functionality of FLAML use pip install lime." - ) - try: - import shap - except ImportError: - shap = None - logger.warning( - "In order to the use the visualization functionality of FLAML use pip install shap." + raise ImportError( + "The visualization functionalitye requires installation of matplotlib. " + "Please run pip install flaml[visualization]" ) - # Showing the feature importance of the data that was trained on + if type == "feature_importance": - feature_importance_type = int(input("Enter 1 for the model's feature importance and enter 2 to use Lime for a different method of feature importance. \n")) - plotfilename = input("Enter a filename to save the feature importance figure:\n") - if feature_importance_type == 1: - plt.barh(self.feature_names_in_, self.feature_importances_) - plt.savefig("{}.png".format(plotfilename)) - """ - elif feature_importance_type == 2: - ''' The code for calculating feature importance with Lime and Diagnose was provided by group 7 in DS 440''' - estimator = getattr(self, "_trained_estimator", None) - if estimator is None: - logger.warning( - "No estimator is trained. Please run fit with enough budget." 
- ) - return None - - if explainer == "LIME": - pandas_xtrain = pd.DataFrame(self._state.X_train) - pandas_xval = pd.DataFrame(self._state.X_val) - - if problem_type == "classification": - explain = lime_tabular.LimeTabularExplainer( - training_data=self._state.X_train, - feature_names=pandas_xtrain.columns, - class_names= list(class_names.values()), - mode='classification', - **kwargs) - - print("True Label: {}\n".format(class_names[self._state.y_train[row_index]])) - - exp = explain.explain_instance(data_row=pandas_xtrain.values[row_index], predict_fn = estimator.predict_proba, **kwargs) - exp.save_to_file('lime_classification.html') - return exp.show_in_notebook(show_table=True) - - if problem_type == "regression": - explain = lime_tabular.LimeTabularExplainer( - training_data= np.array(self._state.X_train), - feature_names=pandas_xtrain.columns, - mode='regression', - **kwargs) - - print(self._state.X_val) - print("True Label: {}\n".format(self._state.y_val[row_index])) - - exp = explain.explain_instance(data_row=self._state.X_val[row_index], predict_fn = estimator.predict, **kwargs) - exp.save_to_file('lime_regression.html') - return exp.show_in_notebook(show_table=True) - - - if problem_type == "text": - pass - - - if explainer == "SHAP": - if problem_type == "classification": - if model_type == "linear": - explainer = shap.LinearExplainer(estimator.predict_proba, self.train_data, feature_dependence="independent") - - elif model_type == "tree": - explainer = shap.TreeExplainer(estimator.predict_proba) - - else: - explainer = shap.KernelExplainer(estimator.predict_proba, self._state.X_train) - - shap_values = explainer.shap_values(self._state.X_val) - return shap.summary_plot(shap_values, self._state.X_val) - - if problem_type == "regression": - print(self._state.X_val) - if plot_type == "waterfall": - explainer = shap.Explainer(estimator.model) - shap_values = explainer(self._state.X_train) - return shap.plots.waterfall(shap_values[row_index]) - - if 
plot_type == "bar": - explainer = shap.Explainer(estimator.model) - shap_values = explainer(self._state.X_train) - return shap.plots.bar(shap_values) - - if plot_type == "beeswarm": - explainer = shap.Explainer(estimator.model) - shap_values = explainer(self._state.X_train) - return shap.plots.beeswarm(shap_values) - - if plot_type == "force": - shap.initjs() - explainer = shap.Explainer(estimator.model) - shap_values = explainer(self._state.X_train) - shap_plot = shap.plots.force(shap_values[row_index]) - shap.save_html("shap_regression.html", shap_plot) - return shap_plot - - ''' - Example: - automl = AutoML() - automl.fit(***settings and data) - automl.visualization(title = "", type = "feature_importance") - It will then display the graph - ''' - """ - elif type == "validation_accuracy": + plt.barh(self.feature_names_in_, self.feature_importances_) + plt.savefig("{}.png".format(plot_filename)) + plt.close() + elif type == "learning_curve": from flaml.data import get_output_from_log - plotfilename = input("Enter a filename to save the validation accuracy figure:\n") - time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \ - get_output_from_log(filename=settings['log_file_name'], time_budget=240) - plt.clf() - plt.cla() - plt.title(title) - plt.xlabel(xlab) - plt.ylabel(ylab) - plt.scatter(time_history, 1 - np.array(valid_loss_history)) - plt.step(time_history, 1 - np.array(best_valid_loss_history), where='post') - plt.savefig("{}".format(plotfilename)) - + log_file_name = kwargs.get("log_file_name") + if not log_file_name: + log_file_name = self._settings.get("log_file_name") + print("log", log_file_name) + if not log_file_name: + logger.warning("Please provide a search history log file.") + ( + time_history, + best_valid_loss_history, + valid_loss_history, + config_history, + metric_history, + ) = get_output_from_log(filename=log_file_name, time_budget=240) + plt.title("Learning Curve") + plt.xlabel("Wall Clock Time (s)") + 
plt.ylabel("Validation Accuracy") + plt.scatter(time_history, 1 - np.array(valid_loss_history)) + plt.step(time_history, 1 - np.array(best_valid_loss_history), where="post") + plt.savefig("{}".format(plot_filename)) def _search_parallel(self): try: diff --git a/setup.py b/setup.py index 3d6b2c6486..1f9fe15576 100644 --- a/setup.py +++ b/setup.py @@ -65,8 +65,7 @@ "rouge_score", "hcrystalball==0.1.10", "seqeval", - "matplotlib" - "pytorch-forecasting>=0.9.0,<=0.10.1", + "matplotlib" "pytorch-forecasting>=0.9.0,<=0.10.1", "mlflow", ], "catboost": ["catboost>=0.26"], @@ -103,7 +102,7 @@ "hcrystalball==0.1.10", "pytorch-forecasting>=0.9.0", ], - "visualization": ["matplotlib", "lime", "shap"], + "visualization": ["matplotlib"], "benchmark": ["catboost>=0.26", "psutil==5.8.0", "xgboost==1.3.3"], }, classifiers=[ diff --git a/test/automl/test_visualization.py b/test/automl/test_visualization.py new file mode 100644 index 0000000000..97190951af --- /dev/null +++ b/test/automl/test_visualization.py @@ -0,0 +1,24 @@ +from flaml import AutoML +from flaml.data import load_openml_dataset + + +def test_fi_lc(): + X_train, X_test, y_train, y_test = load_openml_dataset( + dataset_id=1169, data_dir="./" + ) + settings = { + "time_budget": 10, # total running time in seconds + "metric": "accuracy", # can be: 'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'roc_auc_ovr', + # 'roc_auc_ovo', 'log_loss', 'mape', 'f1', 'ap', 'ndcg', 'micro_f1', 'macro_f1' + "task": "classification", # task type + "log_file_name": "airlines_experiment.log", # flaml log file + "seed": 7654321, # random seed + } + automl = AutoML(**settings) + automl.fit(X_train=X_train, y_train=y_train) + automl.visualize(type="feature_importance", plot_filename="feature_importance") + automl.visualize(type="learning_curve", plot_filename="learning_curve") + + +if __name__ == "__main__": + test_fi_lc() diff --git a/test/test_visualization.py b/test/test_visualization.py deleted file mode 100644 index 
f6beb4f87b..0000000000 --- a/test/test_visualization.py +++ /dev/null @@ -1,66 +0,0 @@ -from flaml import AutoML -from flaml.data import load_openml_dataset -from flaml.data import get_output_from_log -import matplotlib.pyplot as plt -import statsmodels.api as sm - -X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=1169, data_dir='./') -print("Data type:", type(X_train), type(y_train)) - -automl = AutoML() - -settings = { - "time_budget": 60, # total running time in seconds - "metric": 'accuracy', # can be: 'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'roc_auc_ovr', - # 'roc_auc_ovo', 'log_loss', 'mape', 'f1', 'ap', 'ndcg', 'micro_f1', 'macro_f1' - "task": 'classification', # task type - "log_file_name": 'airlines_experiment.log', # flaml log file - "seed": 7654321, # random seed -} - -automl.fit(X_train=X_train, y_train=y_train, **settings) - - -time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \ - get_output_from_log(filename=settings['log_file_name'], time_budget=240) -for config in config_history: - print(config) -print(metric_history) - -# ATMSeer: increasing transparency and controllability in automated ml page 4 -# Whither Automl? 
Understanding the role of automation in machine learning -# Which parameter is more important, which models are working the best dataset -# XAutoML A visual analytics tools for establishing trust - - -t = "Learning Curve" -xl = "Wall Clock Time (s)" -yl = "Validation Accuracy" -pt = "feature" -bvlh = best_valid_loss_history -vlh = valid_loss_history -th = time_history -automl.visualization(type = "feature_importance") -automl.visualization(type = "validation_accuracy", xlab = xl, ylab = yl, settings = settings) - - - -''' -if plottype == "scatter": - plt.title(title) - plt.xlabel(xlab) - plt.ylabel(ylab) - plt.scatter(time_history, 1 - np.array(valid_loss_history)) - plt.step(time_history, 1 - np.array(best_valid_loss_history), where='post') - plt.show() -elif plottype == "feature": - plt.title(title) - plt.barh(self.feature_names_in_, self.feature_importances_) - plt.show() - elif plottype == "Model": - # pie graph that shows percentage of each model with the two best - print("best model b") - elif plottype == "parameters": - # ANOVA - print("dees the best") -''' \ No newline at end of file From 149a97ee0651a174a03340c5159ebcdf2118bf25 Mon Sep 17 00:00:00 2001 From: Qingyun Wu Date: Wed, 7 Dec 2022 11:51:43 -0500 Subject: [PATCH 14/17] Update setup.py --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1f9fe15576..3e4dadd62b 100644 --- a/setup.py +++ b/setup.py @@ -65,7 +65,8 @@ "rouge_score", "hcrystalball==0.1.10", "seqeval", - "matplotlib" "pytorch-forecasting>=0.9.0,<=0.10.1", + "matplotlib", + "pytorch-forecasting>=0.9.0,<=0.10.1", "mlflow", ], "catboost": ["catboost>=0.26"], From df73b4f4c9cf0c52caab410a73f9623b7936efba Mon Sep 17 00:00:00 2001 From: Qingyun Wu Date: Fri, 23 Dec 2022 19:31:35 -0800 Subject: [PATCH 15/17] merge --- .github/workflows/python-package.yml | 11 +- .pre-commit-config.yaml | 2 +- Dockerfile | 10 + README.md | 4 +- flaml/automl/automl.py | 305 ++++++++---- 
flaml/tune/searcher/blendsearch.py | 11 +- flaml/tune/searcher/flow2.py | 2 +- flaml/tune/spark/__init__.py | 8 + flaml/tune/spark/utils.py | 191 +++++++ flaml/tune/trial_runner.py | 38 ++ flaml/tune/tune.py | 178 ++++++- notebook/automl_lightgbm.ipynb | 6 +- notebook/integrate_azureml.ipynb | 6 +- notebook/integrate_spark.ipynb | 1 + notebook/tune_pytorch.ipynb | 19 +- setup.py | 8 + test/automl/__init__.py | 0 test/automl/test_multiclass.py | 148 +++--- test/automl/test_python_log.py | 2 +- test/automl/test_training_log.py | 4 +- test/spark/__init__.py | 0 test/spark/custom_mylearner.py | 124 +++++ test/spark/mylearner.py | 19 + test/spark/test_automl.py | 108 ++++ test/spark/test_ensemble.py | 57 +++ test/spark/test_exceptions.py | 76 +++ test/spark/test_multiclass.py | 470 ++++++++++++++++++ test/spark/test_notebook.py | 41 ++ test/spark/test_performance.py | 110 ++++ test/spark/test_tune.py | 58 +++ test/spark/test_utils.py | 101 ++++ test/tune/test_pytorch_cifar10.py | 6 +- test/tune/test_searcher.py | 4 +- .../Examples/Tune-Lexicographic-objectives.md | 3 +- website/docs/Examples/Tune-PyTorch.md | 5 +- website/docs/FAQ.md | 13 + website/docs/Installation.md | 22 + .../docs/Use-Cases/Task-Oriented-AutoML.md | 23 +- .../Use-Cases/Tune-User-Defined-Function.md | 28 +- 39 files changed, 2018 insertions(+), 204 deletions(-) create mode 100644 flaml/tune/spark/__init__.py create mode 100644 flaml/tune/spark/utils.py create mode 100644 notebook/integrate_spark.ipynb create mode 100644 test/automl/__init__.py create mode 100644 test/spark/__init__.py create mode 100644 test/spark/custom_mylearner.py create mode 100644 test/spark/mylearner.py create mode 100644 test/spark/test_automl.py create mode 100644 test/spark/test_ensemble.py create mode 100644 test/spark/test_exceptions.py create mode 100644 test/spark/test_multiclass.py create mode 100644 test/spark/test_notebook.py create mode 100644 test/spark/test_performance.py create mode 100644 test/spark/test_tune.py 
create mode 100644 test/spark/test_utils.py diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 51fcc40d79..dfc784a058 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -16,7 +16,7 @@ jobs: strategy: matrix: os: [ubuntu-latest, macos-latest, windows-2019] - python-version: ["3.6", "3.7", "3.8", "3.9", "3.10"] + python-version: ["3.7", "3.8", "3.9", "3.10"] steps: - uses: actions/checkout@v3 @@ -37,6 +37,15 @@ jobs: export CFLAGS="$CFLAGS -I/usr/local/opt/libomp/include" export CXXFLAGS="$CXXFLAGS -I/usr/local/opt/libomp/include" export LDFLAGS="$LDFLAGS -Wl,-rpath,/usr/local/opt/libomp/lib -L/usr/local/opt/libomp/lib -lomp" + - name: On Linux, install Spark stand-alone cluster and PySpark + if: matrix.os == 'ubuntu-latest' + run: | + sudo apt-get update && sudo apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends ca-certificates-java ca-certificates openjdk-17-jdk-headless && sudo apt-get clean && sudo rm -rf /var/lib/apt/lists/* + wget --progress=dot:giga "https://www.apache.org/dyn/closer.lua/spark/spark-3.3.0/spark-3.3.0-bin-hadoop2.tgz?action=download" -O - | tar -xzC /tmp; archive=$(basename "spark-3.3.0/spark-3.3.0-bin-hadoop2.tgz") bash -c "sudo mv -v /tmp/\${archive/%.tgz/} /spark" + pip install --no-cache-dir pyspark>=3.0 + export SPARK_HOME=/spark + export PYTHONPATH=/spark/python/lib/py4j-0.10.9.5-src.zip:/spark/python + export PATH=$PATH:$SPARK_HOME/bin - name: Install packages and dependencies run: | python -m pip install --upgrade pip wheel diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 116a702615..dd2c4a71f3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,4 +18,4 @@ repos: - id: check-merge-conflict - id: detect-private-key - id: trailing-whitespace - - id: no-commit-to-branch \ No newline at end of file + # - id: no-commit-to-branch \ No newline at end of file diff --git 
a/Dockerfile b/Dockerfile index bd358f2316..4f0a63aa89 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,6 +3,16 @@ FROM python:3.7 RUN apt-get update && apt-get -y update RUN apt-get install -y sudo git npm +# Install Spark +RUN sudo apt-get update && sudo apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \ + ca-certificates-java ca-certificates openjdk-17-jdk-headless \ + wget \ + && sudo apt-get clean && sudo rm -rf /var/lib/apt/lists/* +RUN wget --progress=dot:giga "https://www.apache.org/dyn/closer.lua/spark/spark-3.3.0/spark-3.3.0-bin-hadoop2.tgz?action=download" -O - | tar -xzC /tmp; archive=$(basename "spark-3.3.0/spark-3.3.0-bin-hadoop2.tgz") bash -c "sudo mv -v /tmp/\${archive/%.tgz/} /spark" +ENV SPARK_HOME=/spark \ + PYTHONPATH=/spark/python/lib/py4j-0.10.9.5-src.zip:/spark/python +ENV PATH="${PATH}:${SPARK_HOME}/bin" + # Setup user to not run as root RUN adduser --disabled-password --gecos '' flaml-dev RUN adduser flaml-dev sudo diff --git a/README.md b/README.md index 163ca56550..71362b0239 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ Use the following guides to get started with FLAML in .NET: - [Install Model Builder](https://docs.microsoft.com/dotnet/machine-learning/how-to-guides/install-model-builder?tabs=visual-studio-2022) - [Install ML.NET CLI](https://docs.microsoft.com/dotnet/machine-learning/how-to-guides/install-ml-net-cli?tabs=windows) -- [Microsoft.AutoML](https://www.nuget.org/packages/Microsoft.ML.AutoML/0.20.0-preview.22313.1) +- [Microsoft.AutoML](https://www.nuget.org/packages/Microsoft.ML.AutoML/0.20.0) ## Quickstart @@ -107,7 +107,7 @@ In addition, you can find: - Contributing guide [here](https://microsoft.github.io/FLAML/docs/Contribute). 
-- ML.NET documentation and tutorials for [Model Builder](https://docs.microsoft.com/dotnet/machine-learning/tutorials/predict-prices-with-model-builder), [ML.NET CLI](https://docs.microsoft.com/en-us/dotnet/machine-learning/tutorials/sentiment-analysis-cli), and [AutoML API](https://github.com/dotnet/csharp-notebooks/blob/main/machine-learning/03-Training%20and%20AutoML.ipynb). +- ML.NET documentation and tutorials for [Model Builder](https://learn.microsoft.com/dotnet/machine-learning/tutorials/predict-prices-with-model-builder), [ML.NET CLI](https://learn.microsoft.com/dotnet/machine-learning/tutorials/sentiment-analysis-cli), and [AutoML API](https://learn.microsoft.com/dotnet/machine-learning/how-to-guides/how-to-use-the-automl-api). ## Contributing diff --git a/flaml/automl/automl.py b/flaml/automl/automl.py index 7e50aa2122..72fea04d75 100644 --- a/flaml/automl/automl.py +++ b/flaml/automl/automl.py @@ -4,6 +4,7 @@ # * project root for license information. import time import os +import sys from typing import Callable, Optional, List, Union, Any import inspect from functools import partial @@ -54,17 +55,28 @@ from flaml.automl.training_log import training_log_reader, training_log_writer from flaml.default import suggest_learner from flaml.version import __version__ as flaml_version +from flaml.tune.spark.utils import check_spark, get_broadcast_data logger = logging.getLogger(__name__) logger_formatter = logging.Formatter( "[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s", "%m-%d %H:%M:%S" ) +logger.propagate = False try: import mlflow except ImportError: mlflow = None +try: + from ray import __version__ as ray_version + + assert ray_version >= "1.10.0" + + ray_available = True +except (ImportError, AssertionError): + ray_available = False + class SearchState: @property @@ -141,31 +153,34 @@ def __init__( if custom_hp is not None: search_space.update(custom_hp) - if ( - isinstance(starting_point, dict) - and max_iter - > 1 # If the number of 
starting point is larger than max iter, avoid the checking - and not self.valid_starting_point(starting_point, search_space) - ): - logger.warning( - "Starting point {} removed because it is outside of the search space".format( - starting_point - ) - ) - starting_point = None - elif isinstance(starting_point, list) and max_iter > len( - starting_point - ): # If the number of starting point is larger than max iter, avoid the checking - starting_point_len = len(starting_point) - starting_point = [ - x for x in starting_point if self.valid_starting_point(x, search_space) - ] - if starting_point_len > len(starting_point): + if isinstance(starting_point, dict): + starting_point = AutoMLState.sanitize(starting_point) + if max_iter > 1 and not self.valid_starting_point( + starting_point, search_space + ): + # If the number of iterations is larger than 1, remove invalid point logger.warning( - "Starting points outside of the search space are removed. " - f"Remaining starting points for {learner_class}: {starting_point}" + "Starting point {} removed because it is outside of the search space".format( + starting_point + ) ) - starting_point = starting_point or None + starting_point = None + elif isinstance(starting_point, list): + starting_point = [AutoMLState.sanitize(x) for x in starting_point] + if max_iter > len(starting_point): + # If the number of starting points is no smaller than max iter, avoid the checking + starting_point_len = len(starting_point) + starting_point = [ + x + for x in starting_point + if self.valid_starting_point(x, search_space) + ] + if starting_point_len > len(starting_point): + logger.warning( + "Starting points outside of the search space are removed. 
" + f"Remaining starting points for {learner_class}: {starting_point}" + ) + starting_point = starting_point or None for name, space in search_space.items(): assert ( @@ -238,7 +253,10 @@ def update(self, result, time_used): and trained_estimator.params.get(trained_estimator.ITER_HP) ) if n_iter: - config[trained_estimator.ITER_HP] = n_iter + if "ml" in config: + config["ml"][trained_estimator.ITER_HP] = n_iter + else: + config[trained_estimator.ITER_HP] = n_iter else: obj, time2eval, trained_estimator = np.inf, 0.0, None metric_for_logging = config = None @@ -325,7 +343,7 @@ def _prepare_sample_train_data(self, sample_size): return sampled_X_train, sampled_y_train, sampled_weight, groups @staticmethod - def _compute_with_config_base(config_w_resource, state, estimator): + def _compute_with_config_base(config_w_resource, state, estimator, is_report=True): if "FLAML_sample_size" in config_w_resource: sample_size = int(config_w_resource["FLAML_sample_size"]) else: @@ -401,16 +419,17 @@ def _compute_with_config_base(config_w_resource, state, estimator): } if sampled_weight is not None: this_estimator_kwargs["sample_weight"] = weight - tune.report(**result) + if is_report is True: + tune.report(**result) return result - def sanitize(self, config: dict) -> dict: + @classmethod + def sanitize(cls, config: dict) -> dict: """Make a config ready for passing to estimator.""" config = config.get("ml", config).copy() - if "FLAML_sample_size" in config: - del config["FLAML_sample_size"] - if "learner" in config: - del config["learner"] + config.pop("FLAML_sample_size", None) + config.pop("learner", None) + config.pop("_choice_", None) return config def _train_with_config( @@ -423,7 +442,7 @@ def _train_with_config( sample_size = config_w_resource.get( "FLAML_sample_size", len(self.y_train_all) ) - config = self.sanitize(config_w_resource) + config = AutoMLState.sanitize(config_w_resource) this_estimator_kwargs = self.fit_kwargs_by_estimator.get( estimator @@ -642,7 +661,10 @@ 
def custom_metric( n_concurrent_trials: [Experimental] int, default=1 | The number of concurrent trials. When n_concurrent_trials > 1, flaml performes [parallel tuning](../../Use-Cases/Task-Oriented-AutoML#parallel-tuning) - and installation of ray is required: `pip install flaml[ray]`. + and installation of ray or spark is required: `pip install flaml[ray]` + or `pip install flaml[spark]`. Please check + [here](https://spark.apache.org/docs/latest/api/python/getting_started/install.html) + for more details about installing Spark. keep_search_state: boolean, default=False | Whether to keep data needed for model search after fit(). By default the state is deleted for space saving. @@ -662,6 +684,15 @@ def custom_metric( datasets, but will incur more overhead in time. If dict: the dict contains the keywords arguments to be passed to [ray.tune.run](https://docs.ray.io/en/latest/tune/api_docs/execution.html). + use_spark: boolean, default=False | Whether to use spark to run the training + in parallel spark jobs. This can be used to accelerate training on large models + and large datasets, but will incur more overhead in time and thus slow down + training in some cases. GPU training is not supported yet when use_spark is True. + For Spark clusters, by default, we will launch one trial per executor. However, + sometimes we want to launch more trials than the number of executors (e.g., local mode). + In this case, we can set the environment variable `FLAML_MAX_CONCURRENT` to override + the detected `num_executors`. The final number of concurrent trials will be the minimum + of `n_concurrent_trials` and `num_executors`. free_mem_ratio: float between 0 and 1, default=0. The free memory ratio to keep during training. metric_constraints: list, default=[] | The list of metric constraints. 
Each element in this list is a 3-tuple, which shall be expressed @@ -753,6 +784,9 @@ def custom_metric( settings["append_log"] = settings.get("append_log", False) settings["min_sample_size"] = settings.get("min_sample_size", MIN_SAMPLE_TRAIN) settings["use_ray"] = settings.get("use_ray", False) + settings["use_spark"] = settings.get("use_spark", False) + if settings["use_ray"] is not False and settings["use_spark"] is not False: + raise ValueError("use_ray and use_spark cannot be both True.") settings["free_mem_ratio"] = settings.get("free_mem_ratio", 0) settings["metric_constraints"] = settings.get("metric_constraints", []) settings["cv_score_agg_func"] = settings.get("cv_score_agg_func", None) @@ -814,13 +848,15 @@ def best_iteration(self): def best_config(self): """A dictionary of the best configuration.""" state = self._search_states.get(self._best_estimator) - return state and getattr(state, "best_config", None) + config = state and getattr(state, "best_config", None) + return config and AutoMLState.sanitize(config) @property def best_config_per_estimator(self): """A dictionary of all estimators' best configuration.""" return { e: e_search_state.best_config + and AutoMLState.sanitize(e_search_state.best_config) for e, e_search_state in self._search_states.items() } @@ -1569,7 +1605,7 @@ def get_estimator_from_log(self, log_file_name, record_id, task): with training_log_reader(log_file_name) as reader: record = reader.get_record(record_id) estimator = record.learner - config = record.config + config = AutoMLState.sanitize(record.config) estimator, _ = train_estimator( X_train=None, @@ -2073,8 +2109,10 @@ def trainable(self) -> Callable[[dict], Optional[float]]: states = self._search_states mem_res = self._mem_thres - def train(config: dict, state): - + def train(config: dict, state, is_report=True): + # handle spark broadcast variables + state = get_broadcast_data(state) + is_report = get_broadcast_data(is_report) sample_size = config.get("FLAML_sample_size") 
config = config.get("ml", config).copy() if sample_size: @@ -2083,8 +2121,9 @@ def train(config: dict, state): # check memory constraints before training if states[estimator].learner_class.size(config) <= mem_res: del config["learner"] + config.pop("_choice_", None) result = AutoMLState._compute_with_config_base( - config, state=state, estimator=estimator + config, state=state, estimator=estimator, is_report=is_report ) else: # If search algorithm is not in flaml, it does not handle the config constraint, should also tune.report before return @@ -2095,7 +2134,8 @@ def train(config: dict, state): "val_loss": np.inf, "trained_estimator": None, } - tune.report(**result) + if is_report is True: + tune.report(**result) return result if self._use_ray is not False: @@ -2105,6 +2145,10 @@ def train(config: dict, state): train, state=self._state, ) + elif self._use_spark: + from flaml.tune.spark.utils import with_parameters + + return with_parameters(train, state=self._state, is_report=False) else: return partial( train, @@ -2165,6 +2209,7 @@ def fit( auto_augment=None, min_sample_size=None, use_ray=None, + use_spark=None, free_mem_ratio=0, metric_constraints=None, custom_hp=None, @@ -2338,7 +2383,10 @@ def custom_metric( n_concurrent_trials: [Experimental] int, default=1 | The number of concurrent trials. When n_concurrent_trials > 1, flaml performes [parallel tuning](../../Use-Cases/Task-Oriented-AutoML#parallel-tuning) - and installation of ray is required: `pip install flaml[ray]`. + and installation of ray or spark is required: `pip install flaml[ray]` + or `pip install flaml[spark]`. Please check + [here](https://spark.apache.org/docs/latest/api/python/getting_started/install.html) + for more details about installing Spark. keep_search_state: boolean, default=False | Whether to keep data needed for model search after fit(). By default the state is deleted for space saving. @@ -2358,6 +2406,10 @@ def custom_metric( datasets, but will incur more overhead in time. 
If dict: the dict contains the keywords arguments to be passed to [ray.tune.run](https://docs.ray.io/en/latest/tune/api_docs/execution.html). + use_spark: boolean, default=False | Whether to use spark to run the training + in parallel spark jobs. This can be used to accelerate training on large models + and large datasets, but will incur more overhead in time and thus slow down + training in some cases. free_mem_ratio: float between 0 and 1, default=0. The free memory ratio to keep during training. metric_constraints: list, default=[] | The list of metric constraints. Each element in this list is a 3-tuple, which shall be expressed @@ -2551,12 +2603,50 @@ def cv_score_agg_func(val_loss_folds, log_metrics_folds): ) min_sample_size = min_sample_size or self._settings.get("min_sample_size") use_ray = self._settings.get("use_ray") if use_ray is None else use_ray + use_spark = self._settings.get("use_spark") if use_spark is None else use_spark + spark_available, spark_error_msg = check_spark() + if use_spark and use_ray is not False: + raise ValueError("use_spark and use_ray cannot be both True.") + elif use_spark and not spark_available: + raise spark_error_msg + + old_level = logger.getEffectiveLevel() + self.verbose = verbose + logger.setLevel(50 - verbose * 10) + if not logger.handlers: + # Add the console handler. + _ch = logging.StreamHandler(stream=sys.stdout) + _ch.setFormatter(logger_formatter) + logger.addHandler(_ch) + + if not use_ray and not use_spark and n_concurrent_trials > 1: + if ray_available: + logger.warning( + "n_concurrent_trials > 1 is only supported when using Ray or Spark. " + "Ray installed, setting use_ray to True. If you want to use Spark, set use_spark to True." + ) + use_ray = True + elif spark_available: + logger.warning( + "n_concurrent_trials > 1 is only supported when using Ray or Spark. " + "Spark installed, setting use_spark to True. If you want to use Ray, set use_ray to True." 
+ ) + use_spark = True + else: + logger.warning( + "n_concurrent_trials > 1 is only supported when using Ray or Spark. " + "Neither Ray nor Spark installed, setting n_concurrent_trials to 1." + ) + n_concurrent_trials = 1 + self._state.n_jobs = n_jobs self._n_concurrent_trials = n_concurrent_trials self._early_stop = early_stop - self._use_ray = use_ray or n_concurrent_trials > 1 + self._use_spark = use_spark + self._use_ray = use_ray # use the following condition if we have an estimation of average_trial_time and average_trial_overhead - # self._use_ray = use_ray or n_concurrent_trials > ( average_trail_time + average_trial_overhead) / (average_trial_time) + # self._use_ray = use_ray or n_concurrent_trials > ( average_trial_time + average_trial_overhead) / (average_trial_time) + if self._use_ray is not False: import ray @@ -2584,6 +2674,11 @@ def cv_score_agg_func(val_loss_folds, log_metrics_folds): X_train = ray.get(X_train) elif isinstance(dataframe, ray.ObjectRef): dataframe = ray.get(dataframe) + else: + # TODO: Integrate with Spark + self._state.resources_per_trial = ( + {"cpu": n_jobs} if n_jobs > 0 else {"cpu": 1} + ) self._state.free_mem_ratio = ( self._settings.get("free_mem_ratio") if free_mem_ratio is None @@ -2614,14 +2709,6 @@ def cv_score_agg_func(val_loss_folds, log_metrics_folds): self._random = np.random.RandomState(RANDOM_SEED) self._seed = seed if seed is not None else 20 self._learner_selector = learner_selector - old_level = logger.getEffectiveLevel() - self.verbose = verbose - logger.setLevel(50 - verbose * 10) - if not logger.handlers: - # Add the console handler. - _ch = logging.StreamHandler() - _ch.setFormatter(logger_formatter) - logger.addHandler(_ch) logger.info(f"task = {task}") self._decide_split_type(split_type) logger.info(f"Data split method: {self._split_type}") @@ -2807,6 +2894,7 @@ def is_to_reverse_metric(metric, task): logger.warning( "No search budget is provided via time_budget or max_iter." 
" Training only one model per estimator." + " Zero-shot AutoML is used for certain tasks and estimators." " To tune hyperparameters for each estimator," " please provide budget either via time_budget or max_iter." ) @@ -2916,7 +3004,7 @@ def is_to_reverse_metric(metric, task): else ( "bs" if n_concurrent_trials > 1 - or self._use_ray is not False + or (self._use_ray is not False or self._use_spark) and len(estimator_list) > 1 else "cfo" ) @@ -3013,20 +3101,24 @@ def visualize( plt.savefig("{}".format(plot_filename)) def _search_parallel(self): - try: - from ray import __version__ as ray_version + if self._use_ray is not False: + try: + from ray import __version__ as ray_version + + assert ray_version >= "1.10.0" + if ray_version.startswith("1."): + from ray.tune.suggest import ConcurrencyLimiter + else: + from ray.tune.search import ConcurrencyLimiter + import ray + except (ImportError, AssertionError): + raise ImportError( + "use_ray=True requires installation of ray. " + "Please run pip install flaml[ray]" + ) + else: + from flaml.tune.searcher.suggestion import ConcurrencyLimiter - assert ray_version >= "1.10.0" - if ray_version.startswith("1."): - from ray.tune.suggest import ConcurrencyLimiter - else: - from ray.tune.search import ConcurrencyLimiter - import ray - except (ImportError, AssertionError): - raise ImportError( - "n_concurrent_trial>1 or use_ray=True requires installation of ray. 
" - "Please run pip install flaml[ray]" - ) if self._hpo_method in ("cfo", "grid"): from flaml import CFO as SearchAlgo elif "bs" == self._hpo_method: @@ -3034,15 +3126,20 @@ def _search_parallel(self): elif "random" == self._hpo_method: from flaml import RandomSearch as SearchAlgo elif "optuna" == self._hpo_method: - try: - from ray import __version__ as ray_version + if self._use_ray is not False: + try: + from ray import __version__ as ray_version - assert ray_version >= "1.10.0" - if ray_version.startswith("1."): - from ray.tune.suggest.optuna import OptunaSearch as SearchAlgo - else: - from ray.tune.search.optuna import OptunaSearch as SearchAlgo - except (ImportError, AssertionError): + assert ray_version >= "1.10.0" + if ray_version.startswith("1."): + from ray.tune.suggest.optuna import OptunaSearch as SearchAlgo + else: + from ray.tune.search.optuna import OptunaSearch as SearchAlgo + except (ImportError, AssertionError): + from flaml.tune.searcher.suggestion import ( + OptunaSearch as SearchAlgo, + ) + else: from flaml.tune.searcher.suggestion import OptunaSearch as SearchAlgo else: raise NotImplementedError( @@ -3086,7 +3183,7 @@ def _search_parallel(self): allow_empty_config=True, ) else: - # if self._hpo_method is bo, sometimes the search space and the initial config dimension do not match + # if self._hpo_method is optuna, sometimes the search space and the initial config dimension do not match # need to remove the extra keys from the search space to be consistent with the initial config converted_space = SearchAlgo.convert_search_space(space) @@ -3108,21 +3205,40 @@ def _search_parallel(self): search_alg = ConcurrencyLimiter(search_alg, self._n_concurrent_trials) resources_per_trial = self._state.resources_per_trial - analysis = ray.tune.run( - self.trainable, - search_alg=search_alg, - config=space, - metric="val_loss", - mode="min", - resources_per_trial=resources_per_trial, - time_budget_s=time_budget_s, - num_samples=self._max_iter, - 
verbose=max(self.verbose - 2, 0), - raise_on_failed_trial=False, - keep_checkpoints_num=1, - checkpoint_score_attr="min-val_loss", - **self._use_ray if isinstance(self._use_ray, dict) else {}, - ) + if self._use_spark: + # use spark as parallel backend + analysis = tune.run( + self.trainable, + search_alg=search_alg, + config=space, + metric="val_loss", + mode="min", + time_budget_s=time_budget_s, + num_samples=self._max_iter, + verbose=max(self.verbose - 2, 0), + use_ray=False, + use_spark=True, + # raise_on_failed_trial=False, + # keep_checkpoints_num=1, + # checkpoint_score_attr="min-val_loss", + ) + else: + # use ray as parallel backend + analysis = ray.tune.run( + self.trainable, + search_alg=search_alg, + config=space, + metric="val_loss", + mode="min", + resources_per_trial=resources_per_trial, + time_budget_s=time_budget_s, + num_samples=self._max_iter, + verbose=max(self.verbose - 2, 0), + raise_on_failed_trial=False, + keep_checkpoints_num=1, + checkpoint_score_attr="min-val_loss", + **self._use_ray if isinstance(self._use_ray, dict) else {}, + ) # logger.info([trial.last_result for trial in analysis.trials]) trials = sorted( ( @@ -3326,7 +3442,7 @@ def _search_sequential(self): num_samples=self._max_iter, ) else: - # if self._hpo_method is bo, sometimes the search space and the initial config dimension do not match + # if self._hpo_method is optuna, sometimes the search space and the initial config dimension do not match # need to remove the extra keys from the search space to be consistent with the initial config converted_space = SearchAlgo.convert_search_space(search_space) removed_keys = set(search_space.keys()).difference( @@ -3365,6 +3481,7 @@ def _search_sequential(self): time_budget_s=time_budget_s, verbose=max(self.verbose - 3, 0), use_ray=False, + use_spark=False, ) time_used = time.time() - start_run_time better = False @@ -3535,7 +3652,7 @@ def _search(self): self._selected = state = self._search_states[estimator] 
state.best_config_sample_size = self._state.data_size[0] state.best_config = state.init_config[0] if state.init_config else {} - elif self._use_ray is False: + elif self._use_ray is False and self._use_spark is False: self._search_sequential() else: self._search_parallel() @@ -3566,7 +3683,7 @@ def _search(self): x[1].learner_class( task=self._state.task, n_jobs=self._state.n_jobs, - **self._state.sanitize(x[1].best_config), + **AutoMLState.sanitize(x[1].best_config), ), ) for x in search_states[:2] @@ -3577,7 +3694,7 @@ def _search(self): x[1].learner_class( task=self._state.task, n_jobs=self._state.n_jobs, - **self._state.sanitize(x[1].best_config), + **AutoMLState.sanitize(x[1].best_config), ), ) for x in search_states[2:] @@ -3599,6 +3716,10 @@ def _search(self): and ray.available_resources()["CPU"] or os.cpu_count() ) + elif self._use_spark: + from flaml.tune.spark.utils import get_n_cpus + + n_cpus = get_n_cpus() else: n_cpus = os.cpu_count() ensemble_n_jobs = ( diff --git a/flaml/tune/searcher/blendsearch.py b/flaml/tune/searcher/blendsearch.py index e1227aa772..f23d35c226 100644 --- a/flaml/tune/searcher/blendsearch.py +++ b/flaml/tune/searcher/blendsearch.py @@ -124,7 +124,7 @@ def __init__( objectives in the metric list. If not provided, we use "min" as the default mode for all the objectives. - "targets" (optional): a dictionary to specify the optimization targets on the objectives. The keys are the metric names (provided in "metric"), and the values are the numerical target values. - - "tolerances"(optional): a dictionary to specify the optimality tolerances on objectives. The keys are the + - "tolerances" (optional): a dictionary to specify the optimality tolerances on objectives. The keys are the metric names (provided in "metrics"), and the values are the numerical tolerances values. 
E.g., ```python @@ -653,10 +653,11 @@ def _expand_admissible_region(self, lower, upper, space): for key in upper: ub = upper[key] if isinstance(ub, list): - choice = space[key]["_choice_"] - self._expand_admissible_region( - lower[key][choice], upper[key][choice], space[key] - ) + choice = space[key].get("_choice_") + if choice: + self._expand_admissible_region( + lower[key][choice], upper[key][choice], space[key] + ) elif isinstance(ub, dict): self._expand_admissible_region(lower[key], ub, space[key]) else: diff --git a/flaml/tune/searcher/flow2.py b/flaml/tune/searcher/flow2.py index 799660e63f..ce097ba0b2 100644 --- a/flaml/tune/searcher/flow2.py +++ b/flaml/tune/searcher/flow2.py @@ -80,7 +80,7 @@ def __init__( objectives in the metric list. If not provided, we use "min" as the default mode for all the objectives - "targets" (optional): a dictionary to specify the optimization targets on the objectives. The keys are the metric names (provided in "metric"), and the values are the numerical target values. - - "tolerances"(optional): a dictionary to specify the optimality tolerances on objectives. The keys are the + - "tolerances" (optional): a dictionary to specify the optimality tolerances on objectives. The keys are the metric names (provided in "metrics"), and the values are the numerical tolerances values. 
E.g., ```python diff --git a/flaml/tune/spark/__init__.py b/flaml/tune/spark/__init__.py new file mode 100644 index 0000000000..873af1534c --- /dev/null +++ b/flaml/tune/spark/__init__.py @@ -0,0 +1,8 @@ +from flaml.tune.spark.utils import ( + check_spark, + get_n_cpus, + with_parameters, + broadcast_code, +) + +__all__ = ["check_spark", "get_n_cpus", "with_parameters", "broadcast_code"] diff --git a/flaml/tune/spark/utils.py b/flaml/tune/spark/utils.py new file mode 100644 index 0000000000..03337c59b6 --- /dev/null +++ b/flaml/tune/spark/utils.py @@ -0,0 +1,191 @@ +import os +import logging +from functools import partial, lru_cache +import textwrap + +logger = logging.getLogger(__name__) +logger_formatter = logging.Formatter( + "[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s", "%m-%d %H:%M:%S" +) + +try: + from pyspark.sql import SparkSession + from pyspark.util import VersionUtils + import pyspark + + _have_spark = True + _spark_major_minor_version = VersionUtils.majorMinorVersion(pyspark.__version__) +except ImportError as e: + logger.debug("Could not import pyspark: %s", e) + _have_spark = False + _spark_major_minor_version = (0, 0) + + +@lru_cache(maxsize=2) +def check_spark(): + """Check if Spark is installed and running. + Result of the function will be cached since test once is enough. As lru_cache will not + cache exceptions, we don't raise exceptions here but only log a warning message. + + Returns: + Return (True, None) if the check passes, otherwise log the exception message and + return (False, Exception(msg)). The exception can be raised by the caller. + """ + logger.warning("\ncheck Spark installation...This line should appear only once.\n") + if not _have_spark: + msg = """use_spark=True requires installation of PySpark. 
Please run pip install flaml[spark] + and check [here](https://spark.apache.org/docs/latest/api/python/getting_started/install.html) + for more details about installing Spark.""" + logger.warning(msg) + return False, ImportError(msg) + + if _spark_major_minor_version[0] < 3: + msg = "Spark version must be >= 3.0 to use flaml[spark]" + logger.warning(msg) + return False, ImportError(msg) + + try: + SparkSession.builder.getOrCreate() + except RuntimeError as e: + logger.warning(f"\nSparkSession is not available: {e}\n") + return False, RuntimeError(e) + + return True, None + + +def get_n_cpus(node="driver"): + """Get the number of CPU cores of the given type of node. + + Args: + node: string | The type of node to get the number of cores. Can be 'driver' or 'executor'. + Default is 'driver'. + + Returns: + An int of the number of CPU cores. + """ + assert node in ["driver", "executor"] + try: + n_cpus = int( + SparkSession.builder.getOrCreate() + .sparkContext.getConf() + .get(f"spark.{node}.cores") + ) + except (TypeError, RuntimeError): + n_cpus = os.cpu_count() + return n_cpus + + +def with_parameters(trainable, **kwargs): + """Wrapper for trainables to pass arbitrary large data objects. + + This wrapper function will store all passed parameters in the Spark + Broadcast variable. + + Args: + trainable: Trainable to wrap. + **kwargs: parameters to store in object store. + + Returns: + A new function with partial application of the given arguments + and keywords. The given arguments and keywords will be broadcasted + to all the executors. 
+ + + ```python + import pyspark + import flaml + from sklearn.datasets import load_iris + def train(config, data=None): + if isinstance(data, pyspark.broadcast.Broadcast): + data = data.value + print(config, data) + + data = load_iris() + with_parameters_train = flaml.tune.spark.utils.with_parameters(train, data=data) + with_parameters_train(config=1) + train(config={"metric": "accuracy"}) + ``` + """ + + if not callable(trainable): + raise ValueError( + f"`with_parameters() only works with function trainables`. " + f"Got type: " + f"{type(trainable)}." + ) + + spark_available, spark_error_msg = check_spark() + if not spark_available: + raise spark_error_msg + spark = SparkSession.builder.getOrCreate() + + bc_kwargs = dict() + for k, v in kwargs.items(): + bc_kwargs[k] = spark.sparkContext.broadcast(v) + + return partial(trainable, **bc_kwargs) + + +def broadcast_code(custom_code="", file_name="mylearner"): + """Write customized learner/metric code contents to a file for importing. + It is necessary for using the customized learner/metric in spark backend. + The path of the learner/metric file will be returned. + + Args: + custom_code: str, default="" | code contents of the custom learner/metric. + file_name: str, default="mylearner" | file name of the custom learner/metric. + + Returns: + The path of the custom code file. 
+ ```python + from flaml.tune.spark.utils import broadcast_code + from flaml.automl.model import LGBMEstimator + + custom_code = ''' + from flaml.automl.model import LGBMEstimator + from flaml import tune + + class MyLargeLGBM(LGBMEstimator): + @classmethod + def search_space(cls, **params): + return { + "n_estimators": { + "domain": tune.lograndint(lower=4, upper=32768), + "init_value": 32768, + "low_cost_init_value": 4, + }, + "num_leaves": { + "domain": tune.lograndint(lower=4, upper=32768), + "init_value": 32768, + "low_cost_init_value": 4, + }, + } + ''' + + broadcast_code(custom_code=custom_code) + from flaml.tune.spark.mylearner import MyLargeLGBM + assert isinstance(MyLargeLGBM(), LGBMEstimator) + ``` + """ + flaml_path = os.path.dirname(os.path.abspath(__file__)) + custom_code = textwrap.dedent(custom_code) + custom_path = os.path.join(flaml_path, file_name + ".py") + + with open(custom_path, "w") as f: + f.write(custom_code) + + return custom_path + + +def get_broadcast_data(broadcast_data): + """Get the broadcast data from the broadcast variable. + + Args: + broadcast_data: pyspark.broadcast.Broadcast | the broadcast variable. + + Returns: + The broadcast data. 
+ """ + if _have_spark and isinstance(broadcast_data, pyspark.broadcast.Broadcast): + broadcast_data = broadcast_data.value + return broadcast_data diff --git a/flaml/tune/trial_runner.py b/flaml/tune/trial_runner.py index 6aa2bcd5bc..8fe8185a73 100644 --- a/flaml/tune/trial_runner.py +++ b/flaml/tune/trial_runner.py @@ -135,3 +135,41 @@ def step(self) -> Trial: def stop_trial(self, trial): super().stop_trial(trial) self.running_trial = None + + +class SparkTrialRunner(BaseTrialRunner): + """Implementation of the spark trial runner.""" + + def __init__( + self, + search_alg=None, + scheduler=None, + metric: Optional[str] = None, + mode: Optional[str] = "min", + ): + super().__init__(search_alg, scheduler, metric, mode) + self.running_trials = [] + + def step(self) -> Trial: + """Runs one step of the trial event loop. + + Callers should typically run this method repeatedly in a loop. They + may inspect or modify the runner's state in between calls to step(). + + Returns: + a trial to run. 
+ """ + trial_id = Trial.generate_id() + config = self._search_alg.suggest(trial_id) + if config is not None: + trial = SimpleTrial(config, trial_id) + self.add_trial(trial) + trial.set_status(Trial.RUNNING) + self.running_trials.append(trial) + else: + trial = None + return trial + + def stop_trial(self, trial): + super().stop_trial(trial) + self.running_trials.remove(trial) diff --git a/flaml/tune/tune.py b/flaml/tune/tune.py index bc2e11ada5..7b5b2a62fe 100644 --- a/flaml/tune/tune.py +++ b/flaml/tune/tune.py @@ -7,6 +7,7 @@ import datetime import time import os +import sys from collections import defaultdict try: @@ -15,9 +16,9 @@ assert ray_version >= "1.10.0" from ray.tune.analysis import ExperimentAnalysis as EA - ray_import = True + ray_available = True except (ImportError, AssertionError): - ray_import = False + ray_available = False from .analysis import ExperimentAnalysis as EA from .trial import Trial @@ -25,6 +26,7 @@ import logging logger = logging.getLogger(__name__) +logger.propagate = False _use_ray = True _runner = None _verbose = 0 @@ -226,6 +228,7 @@ def run( metric_constraints: Optional[List[Tuple[str, str, float]]] = None, max_failure: Optional[int] = 100, use_ray: Optional[bool] = False, + use_spark: Optional[bool] = False, use_incumbent_result_in_evaluation: Optional[bool] = None, log_file_name: Optional[str] = None, lexico_objectives: Optional[dict] = None, @@ -359,9 +362,10 @@ def easy_objective(config): print(analysis.trials[-1].last_result) ``` - verbose: 0, 1, 2, or 3. Verbosity mode for ray if ray backend is used. - 0 = silent, 1 = only status updates, 2 = status and brief trial - results, 3 = status and detailed trial results. Defaults to 2. + verbose: 0, 1, 2, or 3. If ray or spark backend is used, their verbosity will be + affected by this argument. 0 = silent, 1 = only status updates, + 2 = status and brief trial results, 3 = status and detailed trial results. + Defaults to 2. 
local_dir: A string of the local dir to save ray logs if ray backend is used; or a local dir to save the tuning log. num_samples: An integer of the number of configs to try. Defaults to 1. @@ -380,6 +384,7 @@ def easy_objective(config): max_failure: int | the maximal consecutive number of failures to sample a trial before the tuning is terminated. use_ray: A boolean of whether to use ray as the backend. + use_spark: A boolean of whether to use spark as the backend. log_file_name: A string of the log file name. Default to None. When set to None: if local_dir is not given, no log file is created; @@ -396,17 +401,17 @@ def easy_objective(config): objectives in the metric list. If not provided, we use "min" as the default mode for all the objectives. - "targets" (optional): a dictionary to specify the optimization targets on the objectives. The keys are the metric names (provided in "metric"), and the values are the numerical target values. - - "tolerances"(optional): a dictionary to specify the optimality tolerances on objectives. The keys are the + - "tolerances" (optional): a dictionary to specify the optimality tolerances on objectives. The keys are the metric names (provided in "metrics"), and the values are the numerical tolerances values. E.g., - ```python - lexico_objectives = { - "metrics": ["error_rate", "pred_time"], - "modes": ["min", "min"], - "tolerances": {"error_rate": 0.01, "pred_time": 0.0}, - "targets": {"error_rate": 0.0}, - } - ``` + ```python + lexico_objectives = { + "metrics": ["error_rate", "pred_time"], + "modes": ["min", "min"], + "tolerances": {"error_rate": 0.01, "pred_time": 0.0}, + "targets": {"error_rate": 0.0}, + } + ``` **ray_args: keyword arguments to pass to ray.tune.run(). Only valid when use_ray=True. 
""" @@ -423,7 +428,10 @@ def easy_objective(config): log_file_name = os.path.join( local_dir, "tune_" + str(datetime.datetime.now()).replace(":", "-") + ".log" ) + if use_ray and use_spark: + raise ValueError("use_ray and use_spark cannot be both True.") if not use_ray: + _use_ray = False _verbose = verbose old_handlers = logger.handlers old_level = logger.getEffectiveLevel() @@ -443,7 +451,7 @@ def easy_objective(config): logger.addHandler(logging.FileHandler(log_file_name)) elif not logger.hasHandlers(): # Add the console handler. - _ch = logging.StreamHandler() + _ch = logging.StreamHandler(stream=sys.stdout) logger_formatter = logging.Formatter( "[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s", "%m-%d %H:%M:%S", @@ -523,7 +531,7 @@ def easy_objective(config): if metric is None or mode is None: metric = metric or search_alg.metric or DEFAULT_METRIC mode = mode or search_alg.mode - if ray_import: + if ray_available and use_ray: if ray_version.startswith("1."): from ray.tune.suggest import ConcurrencyLimiter else: @@ -567,7 +575,7 @@ def easy_objective(config): params["grace_period"] = min_resource if reduction_factor: params["reduction_factor"] = reduction_factor - if ray_import: + if ray_available: from ray.tune.schedulers import ASHAScheduler scheduler = ASHAScheduler(**params) @@ -605,6 +613,142 @@ def easy_objective(config): _running_trial = old_running_trial _training_iteration = old_training_iteration + if use_spark: + # parallel run with spark + from flaml.tune.spark.utils import check_spark + + spark_available, spark_error_msg = check_spark() + if not spark_available: + raise spark_error_msg + try: + from pyspark.sql import SparkSession + from joblib import Parallel, delayed, parallel_backend + from joblibspark import register_spark + except ImportError as e: + raise ImportError( + f"{e}. Try pip install flaml[spark] or set use_spark=False." 
+ ) + from flaml.tune.searcher.suggestion import ConcurrencyLimiter + from .trial_runner import SparkTrialRunner + + register_spark() + spark = SparkSession.builder.getOrCreate() + sc = spark._jsc.sc() + num_executors = ( + len([executor.host() for executor in sc.statusTracker().getExecutorInfos()]) + - 1 + ) + """ + By default, the number of executors is the number of VMs in the cluster. And we can + launch one trial per executor. However, sometimes we can launch more trials than + the number of executors (e.g., local mode). In this case, we can set the environment + variable `FLAML_MAX_CONCURRENT` to override the detected `num_executors`. + + `max_concurrent` is the maximum number of concurrent trials defined by `search_alg`, + `FLAML_MAX_CONCURRENT` will also be used to override `max_concurrent` if `search_alg` + is not an instance of `ConcurrencyLimiter`. + + The final number of concurrent trials is the minimum of `max_concurrent` and + `num_executors`. + """ + num_executors = max(num_executors, int(os.getenv("FLAML_MAX_CONCURRENT", 1)), 1) + time_start = time.time() + if scheduler: + scheduler.set_search_properties(metric=metric, mode=mode) + if isinstance(search_alg, ConcurrencyLimiter): + max_concurrent = max(1, search_alg.max_concurrent) + else: + max_concurrent = max(1, int(os.getenv("FLAML_MAX_CONCURRENT", 1))) + + n_concurrent_trials = min(num_executors, max_concurrent) + with parallel_backend("spark"): + with Parallel( + n_jobs=n_concurrent_trials, verbose=max(0, (verbose - 1) * 50) + ) as parallel: + try: + _runner = SparkTrialRunner( + search_alg=search_alg, + scheduler=scheduler, + metric=metric, + mode=mode, + ) + num_trials = 0 + if time_budget_s is None: + time_budget_s = np.inf + fail = 0 + ub = ( + len(evaluated_rewards) if evaluated_rewards else 0 + ) + max_failure + while ( + time.time() - time_start < time_budget_s + and (num_samples < 0 or num_trials < num_samples) + and fail < ub + ): + while len(_runner.running_trials) < 
n_concurrent_trials: + # suggest trials for spark + trial_next = _runner.step() + if trial_next: + num_trials += 1 + else: + fail += 1 # break with ub consecutive failures + logger.debug(f"consecutive failures is {fail}") + if fail >= ub: + break + trials_to_run = _runner.running_trials + if not trials_to_run: + logger.warning( + f"fail to sample a trial for {max_failure} times in a row, stopping." + ) + break + logger.info( + f"Number of trials: {num_trials}/{num_samples}, {len(_runner.running_trials)} RUNNING," + f" {len(_runner._trials) - len(_runner.running_trials)} TERMINATED" + ) + logger.debug( + f"Configs of Trials to run: {[trial_to_run.config for trial_to_run in trials_to_run]}" + ) + results = parallel( + delayed(evaluation_function)(trial_to_run.config) + for trial_to_run in trials_to_run + ) + # results = [evaluation_function(trial_to_run.config) for trial_to_run in trials_to_run] + while results: + result = results.pop(0) + trial_to_run = trials_to_run[0] + _runner.running_trial = trial_to_run + if result is not None: + if isinstance(result, dict): + if result: + logger.info(f"Brief result: {result}") + report(**result) + else: + # When the result returned is an empty dict, set the trial status to error + trial_to_run.set_status(Trial.ERROR) + else: + logger.info( + "Brief result: {}".format({metric: result}) + ) + report(_metric=result) + _runner.stop_trial(trial_to_run) + fail = 0 + analysis = ExperimentAnalysis( + _runner.get_trials(), + metric=metric, + mode=mode, + lexico_objectives=lexico_objectives, + ) + return analysis + finally: + # recover the global variables in case of nested run + _use_ray = old_use_ray + _verbose = old_verbose + _running_trial = old_running_trial + _training_iteration = old_training_iteration + if not use_ray: + _runner = old_runner + logger.handlers = old_handlers + logger.setLevel(old_level) + # simple sequential run without using tune.run() from ray time_start = time.time() _use_ray = False diff --git 
a/notebook/automl_lightgbm.ipynb b/notebook/automl_lightgbm.ipynb index 3b76e39c09..410912cd57 100644 --- a/notebook/automl_lightgbm.ipynb +++ b/notebook/automl_lightgbm.ipynb @@ -1041,7 +1041,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.9.12 64-bit", + "display_name": "Python 3.8.13 ('syml-py38')", "language": "python", "name": "python3" }, @@ -1055,11 +1055,11 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.12" + "version": "3.8.13" }, "vscode": { "interpreter": { - "hash": "949777d72b0d2535278d3dc13498b2535136f6dfe0678499012e853ee9abcab1" + "hash": "e3d9487e2ef008ade0db1bc293d3206d35cb2b6081faff9f66b40b257b7398f7" } } }, diff --git a/notebook/integrate_azureml.ipynb b/notebook/integrate_azureml.ipynb index b34f724fda..b7f0694f7f 100644 --- a/notebook/integrate_azureml.ipynb +++ b/notebook/integrate_azureml.ipynb @@ -203,7 +203,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.9.7 ('base')", + "display_name": "Python 3.8.13 ('syml-py38')", "language": "python", "name": "python3" }, @@ -217,11 +217,11 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.8.13" }, "vscode": { "interpreter": { - "hash": "e811209110f5aa4d8c2189eeb3ff7b9b4d146931cb9189ef6041ff71605c541d" + "hash": "e3d9487e2ef008ade0db1bc293d3206d35cb2b6081faff9f66b40b257b7398f7" } } }, diff --git a/notebook/integrate_spark.ipynb b/notebook/integrate_spark.ipynb new file mode 100644 index 0000000000..e440787b0a --- /dev/null +++ b/notebook/integrate_spark.ipynb @@ -0,0 +1 @@ +{"cells":[{"attachments":{},"cell_type":"markdown","metadata":{"slideshow":{"slide_type":"slide"}},"source":["Copyright (c) Microsoft Corporation. All rights reserved. \n","\n","Licensed under the MIT License.\n","\n","# Run FLAML Parallel tuning with Spark\n","\n","\n","## 1. 
Introduction\n","\n","FLAML is a Python library (https://github.com/microsoft/FLAML) designed to automatically produce accurate machine learning models \n","with low computational cost. It is fast and economical. The simple and lightweight design makes it easy \n","to use and extend, such as adding new learners. FLAML can \n","- serve as an economical AutoML engine,\n","- be used as a fast hyperparameter tuning tool, or \n","- be embedded in self-tuning software that requires low latency & resource in repetitive\n"," tuning tasks.\n","\n","In this notebook, we demonstrate how to run FLAML parallel tuning using Spark as the backend.\n","\n","FLAML requires `Python>=3.7`. To run this notebook example, please install flaml with the following options:\n","```bash\n","pip install flaml[spark,notebook,blendsearch]>=1.1.0\n","```\n","*Spark support is added in v1.1.0*"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:16:51.6335768Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:17:21.9028602Z\",\"execution_finish_time\":\"2022-12-07T08:18:52.3646576Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}"},"outputs":[],"source":["# %pip install flaml[spark,notebook,blendsearch]>=1.1.0"]},{"cell_type":"markdown","metadata":{"slideshow":{"slide_type":"slide"}},"source":["## 2. Regression Example\n","### Load data and preprocess\n","\n","Download [houses dataset](https://www.openml.org/d/537) from OpenML. 
The task is to predict median price of the house in the region based on demographic composition and a state of housing market in the region."]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:53.4783943Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:20:55.7666047Z\",\"execution_finish_time\":\"2022-12-07T08:21:10.9050139Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","slideshow":{"slide_type":"subslide"},"tags":[]},"outputs":[],"source":["from flaml.data import load_openml_dataset\n","X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=537, data_dir='./')"]},{"cell_type":"markdown","metadata":{"slideshow":{"slide_type":"slide"}},"source":["### Run FLAML\n","In the FLAML automl run configuration, users can specify the task type, time budget, error metric, learner list, whether to subsample, resampling strategy type, and so on. All these arguments have default values which will be used if users do not provide them. 
\n","\n","Notice that here `use_spark` is set to `True` in order to use Spark as the parallel training backend."]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:53.7001471Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:21:10.9846131Z\",\"execution_finish_time\":\"2022-12-07T08:21:11.3604062Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","slideshow":{"slide_type":"slide"},"tags":[]},"outputs":[],"source":["''' import AutoML class from flaml package '''\n","from flaml import AutoML\n","automl = AutoML()"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:53.8983341Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:21:11.4417491Z\",\"execution_finish_time\":\"2022-12-07T08:21:11.8242955Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","slideshow":{"slide_type":"slide"}},"outputs":[],"source":["settings = {\n"," \"time_budget\": 30, # total running time in seconds\n"," \"metric\": 'r2', # primary metrics for regression can be chosen from: ['mae','mse','r2','rmse','mape']\n"," \"estimator_list\": ['lgbm'], # list of ML learners; we tune lightgbm in this example\n"," \"task\": 'regression', # task type \n"," \"log_file_name\": 'houses_experiment.log', # flaml log file\n"," \"seed\": 7654321, # random seed\n"," \"use_spark\": True, # whether to use Spark for distributed training\n"," \"n_concurrent_trials\": 2, # the maximum number of concurrent trials\n","}"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li 
Jiang\":{\"queued_time\":\"2022-12-07T08:20:54.3953298Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:21:11.9003975Z\",\"execution_finish_time\":\"2022-12-07T08:27:58.525709Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","slideshow":{"slide_type":"slide"},"tags":[]},"outputs":[],"source":["'''The main flaml automl API'''\n","automl.fit(X_train=X_train, y_train=y_train, **settings)"]},{"cell_type":"markdown","metadata":{"slideshow":{"slide_type":"slide"}},"source":["### Best model and metric"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:54.789647Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:27:58.6014435Z\",\"execution_finish_time\":\"2022-12-07T08:27:58.9745212Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","slideshow":{"slide_type":"slide"},"tags":[]},"outputs":[],"source":["''' retrieve best config'''\n","print('Best hyperparmeter config:', automl.best_config)\n","print('Best r2 on validation data: {0:.4g}'.format(1-automl.best_loss))\n","print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:54.9962623Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:27:59.0491242Z\",\"execution_finish_time\":\"2022-12-07T08:27:59.4076477Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","slideshow":{"slide_type":"slide"}},"outputs":[],"source":["automl.model.estimator"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li 
Jiang\":{\"queued_time\":\"2022-12-07T08:20:55.2539877Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:27:59.5247209Z\",\"execution_finish_time\":\"2022-12-07T08:28:00.4849272Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}"},"outputs":[],"source":["import matplotlib.pyplot as plt\n","plt.barh(automl.feature_names_in_, automl.feature_importances_)"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:55.5182783Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:28:00.5644015Z\",\"execution_finish_time\":\"2022-12-07T08:28:01.5531147Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","slideshow":{"slide_type":"slide"}},"outputs":[],"source":["''' pickle and save the automl object '''\n","import pickle\n","with open('automl.pkl', 'wb') as f:\n"," pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:55.803107Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:28:01.6350567Z\",\"execution_finish_time\":\"2022-12-07T08:28:02.5774117Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","slideshow":{"slide_type":"slide"},"tags":[]},"outputs":[],"source":["''' compute predictions of testing dataset ''' \n","y_pred = automl.predict(X_test)\n","print('Predicted labels', y_pred)\n","print('True labels', y_test)"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:56.0585537Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:28:02.6537337Z\",\"execution_finish_time\":\"2022-12-07T08:28:03.0177805Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","slideshow":{"slide_type":"slide"},"tags":[]},"outputs":[],"source":["''' compute different metric values on testing 
dataset'''\n","from flaml.ml import sklearn_metric_loss_score\n","print('r2', '=', 1 - sklearn_metric_loss_score('r2', y_pred, y_test))\n","print('mse', '=', sklearn_metric_loss_score('mse', y_pred, y_test))\n","print('mae', '=', sklearn_metric_loss_score('mae', y_pred, y_test))"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:56.2226463Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:28:03.1150781Z\",\"execution_finish_time\":\"2022-12-07T08:28:03.4858362Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","slideshow":{"slide_type":"subslide"},"tags":[]},"outputs":[],"source":["from flaml.data import get_output_from_log\n","time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \\\n"," get_output_from_log(filename=settings['log_file_name'], time_budget=60)\n","\n","for config in config_history:\n"," print(config)"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:56.4020235Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:28:03.5811012Z\",\"execution_finish_time\":\"2022-12-07T08:28:04.5493292Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","slideshow":{"slide_type":"slide"}},"outputs":[],"source":["import numpy as np\n","\n","plt.title('Learning Curve')\n","plt.xlabel('Wall Clock Time (s)')\n","plt.ylabel('Validation r2')\n","plt.scatter(time_history, 1 - np.array(valid_loss_history))\n","plt.step(time_history, 1 - np.array(best_valid_loss_history), where='post')\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":["## 3. Add a customized LightGBM learner in FLAML\n","The native API of LightGBM allows one to specify a custom objective function in the model constructor. You can easily enable it by adding a customized LightGBM learner in FLAML. 
In the following example, we show how to add such a customized LightGBM learner with a custom objective function for parallel tuning with Spark.\n","\n","It's a little bit different from adding customized learners for sequential training. In sequential training, we can define the customized learner in a notebook cell. However, in spark training, we have to import it from a file so that Spark can use it in executors. We can easily do it by leveraging `broadcast_code` function in `flaml.tune.spark.utils`."]},{"cell_type":"markdown","metadata":{},"source":["### Create a customized LightGBM learner with a custom objective function"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T09:09:49.540914Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T09:09:49.6259637Z\",\"execution_finish_time\":\"2022-12-07T09:09:50.5841239Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}"},"outputs":[],"source":["custom_code = \"\"\"\n","import numpy as np \n","from flaml.model import LGBMEstimator\n","from flaml import tune\n","\n","\n","''' define your customized objective function '''\n","def my_loss_obj(y_true, y_pred):\n"," c = 0.5\n"," residual = y_pred - y_true\n"," grad = c * residual /(np.abs(residual) + c)\n"," hess = c ** 2 / (np.abs(residual) + c) ** 2\n"," # rmse grad and hess\n"," grad_rmse = residual\n"," hess_rmse = 1.0\n"," \n"," # mae grad and hess\n"," grad_mae = np.array(residual)\n"," grad_mae[grad_mae > 0] = 1.\n"," grad_mae[grad_mae <= 0] = -1.\n"," hess_mae = 1.0\n","\n"," coef = [0.4, 0.3, 0.3]\n"," return coef[0] * grad + coef[1] * grad_rmse + coef[2] * grad_mae, \\\n"," coef[0] * hess + coef[1] * hess_rmse + coef[2] * hess_mae\n","\n","\n","''' create a customized LightGBM learner class with your objective function '''\n","class MyLGBM(LGBMEstimator):\n"," '''LGBMEstimator with my_loss_obj as the objective function\n"," '''\n","\n"," def __init__(self, 
**config):\n"," super().__init__(objective=my_loss_obj, **config)\n","\"\"\"\n","\n","from flaml.tune.spark.utils import broadcast_code\n","custom_learner_path = broadcast_code(custom_code=custom_code)\n","print(custom_learner_path)\n","from flaml.tune.spark.mylearner import MyLGBM"]},{"cell_type":"markdown","metadata":{},"source":["### Add the customized learner in FLAML"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T09:14:16.2449566Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T09:14:16.3227204Z\",\"execution_finish_time\":\"2022-12-07T09:16:49.7573919Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","tags":[]},"outputs":[],"source":["automl = AutoML()\n","automl.add_learner(learner_name='my_lgbm', learner_class=MyLGBM)\n","settings = {\n"," \"time_budget\": 30, # total running time in seconds\n"," \"metric\": 'r2', # primary metrics for regression can be chosen from: ['mae','mse','r2']\n"," \"estimator_list\": ['my_lgbm',], # list of ML learners; we tune lightgbm in this example\n"," \"task\": 'regression', # task type \n"," \"log_file_name\": 'houses_experiment_my_lgbm.log', # flaml log file\n"," \"n_concurrent_trials\": 2,\n"," \"use_spark\": True,\n","}\n","automl.fit(X_train=X_train, y_train=y_train, **settings)"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T09:17:06.0159529Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T09:17:06.1042554Z\",\"execution_finish_time\":\"2022-12-07T09:17:06.467989Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","tags":[]},"outputs":[],"source":["print('Best hyperparmeter config:', automl.best_config)\n","print('Best r2 on validation data: {0:.4g}'.format(1-automl.best_loss))\n","print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))\n","\n","y_pred = 
automl.predict(X_test)\n","print('Predicted labels', y_pred)\n","print('True labels', y_test)\n","\n","from flaml.ml import sklearn_metric_loss_score\n","print('r2', '=', 1 - sklearn_metric_loss_score('r2', y_pred, y_test))\n","print('mse', '=', sklearn_metric_loss_score('mse', y_pred, y_test))\n","print('mae', '=', sklearn_metric_loss_score('mae', y_pred, y_test))"]},{"cell_type":"code","execution_count":null,"metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"nteract":{"transient":{"deleting":false}}},"outputs":[],"source":[]}],"metadata":{"kernel_info":{"name":"synapse_pyspark"},"kernelspec":{"display_name":"Python 3.8.13 ('syml-py38')","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.13 (default, Oct 21 2022, 23:50:54) \n[GCC 11.2.0]"},"notebook_environment":{},"save_output":true,"spark_compute":{"compute_id":"/trident/default","session_options":{"conf":{"spark.livy.synapse.ipythonInterpreter.enabled":"true"},"enableDebugMode":false,"keepAliveTimeout":30}},"synapse_widget":{"state":{},"version":"0.1"},"trident":{"lakehouse":{}},"vscode":{"interpreter":{"hash":"e3d9487e2ef008ade0db1bc293d3206d35cb2b6081faff9f66b40b257b7398f7"}}},"nbformat":4,"nbformat_minor":0} diff --git a/notebook/tune_pytorch.ipynb b/notebook/tune_pytorch.ipynb index d90f4fb9c1..93153ac50c 100644 --- a/notebook/tune_pytorch.ipynb +++ b/notebook/tune_pytorch.ipynb @@ -347,7 +347,11 @@ " best_trained_model = nn.DataParallel(best_trained_model)\n", "best_trained_model.to(device)\n", "\n", - "checkpoint_path = os.path.join(best_trial.checkpoint.value, \"checkpoint\")\n", + "checkpoint_value = (\n", + " getattr(best_trial.checkpoint, \"dir_or_data\", None)\n", + " or best_trial.checkpoint.value\n", + ")\n", + "checkpoint_path = os.path.join(checkpoint_value, \"checkpoint\")\n", "\n", 
"model_state, optimizer_state = torch.load(checkpoint_path)\n", "best_trained_model.load_state_dict(model_state)\n", @@ -358,11 +362,9 @@ } ], "metadata": { - "interpreter": { - "hash": "f7771e6a3915580179405189f5aa4eb9047494cbe4e005b29b851351b54902f6" - }, "kernelspec": { - "display_name": "Python 3.8.10 64-bit ('venv': venv)", + "display_name": "Python 3.11.0 64-bit", + "language": "python", "name": "python3" }, "language_info": { @@ -375,12 +377,17 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.12" + "version": "3.11.0" }, "metadata": { "interpreter": { "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" } + }, + "vscode": { + "interpreter": { + "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" + } } }, "nbformat": 4, diff --git a/setup.py b/setup.py index 3e4dadd62b..b798035c29 100644 --- a/setup.py +++ b/setup.py @@ -44,6 +44,10 @@ "matplotlib", "openml==0.10.2", ], + "spark": [ + "pyspark>=3.0.0", + "joblibspark>=0.5.0", + ], "test": [ "flake8>=3.8.4", "thop", @@ -68,6 +72,10 @@ "matplotlib", "pytorch-forecasting>=0.9.0,<=0.10.1", "mlflow", + "pyspark>=3.0.0", + "joblibspark>=0.5.0", + "nbconvert", + "nbformat", ], "catboost": ["catboost>=0.26"], "blendsearch": ["optuna==2.8.0"], diff --git a/test/automl/__init__.py b/test/automl/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/automl/test_multiclass.py b/test/automl/test_multiclass.py index f6bb383964..b9f7b177a5 100644 --- a/test/automl/test_multiclass.py +++ b/test/automl/test_multiclass.py @@ -146,6 +146,18 @@ def test_custom_learner(self): MyRegularizedGreedyForest.search_space = lambda data_size, task: {} automl.fit(X_train=X_train, y_train=y_train, **settings) + try: + import ray + + del settings["time_budget"] + settings["max_iter"] = 5 + # test the "_choice_" issue when using ray + automl.fit( + X_train=X_train, y_train=y_train, n_concurrent_trials=2, **settings + ) + 
except ImportError: + return + def test_ensemble(self): automl = AutoML() automl.add_learner(learner_name="RGF", learner_class=MyRegularizedGreedyForest) @@ -171,8 +183,8 @@ def test_dataframe(self): def test_custom_metric(self): df, y = load_iris(return_X_y=True, as_frame=True) df["label"] = y - automl_experiment = AutoML() - automl_settings = { + automl = AutoML() + settings = { "dataframe": df, "label": "label", "time_budget": 5, @@ -188,16 +200,16 @@ def test_custom_metric(self): "pred_time_limit": 1e-5, "ensemble": True, } - automl_experiment.fit(**automl_settings) - print(automl_experiment.classes_) - print(automl_experiment.model) - print(automl_experiment.config_history) - print(automl_experiment.best_model_for_estimator("rf")) - print(automl_experiment.best_iteration) - print(automl_experiment.best_estimator) - automl_experiment = AutoML() - estimator = automl_experiment.get_estimator_from_log( - automl_settings["log_file_name"], record_id=0, task="multiclass" + automl.fit(**settings) + print(automl.classes_) + print(automl.model) + print(automl.config_history) + print(automl.best_model_for_estimator("rf")) + print(automl.best_iteration) + print(automl.best_estimator) + automl = AutoML() + estimator = automl.get_estimator_from_log( + settings["log_file_name"], record_id=0, task="multiclass" ) print(estimator) ( @@ -206,17 +218,20 @@ def test_custom_metric(self): valid_loss_history, config_history, metric_history, - ) = get_output_from_log( - filename=automl_settings["log_file_name"], time_budget=6 - ) + ) = get_output_from_log(filename=settings["log_file_name"], time_budget=6) print(metric_history) try: import ray df = ray.put(df) - automl_settings["dataframe"] = df - automl_settings["use_ray"] = True - automl_experiment.fit(**automl_settings) + settings["dataframe"] = df + settings["use_ray"] = True + del settings["time_budget"] + settings["max_iter"] = 2 + automl.fit(**settings) + estimator = automl.get_estimator_from_log( + settings["log_file_name"], 
record_id=1, task="multiclass" + ) except ImportError: pass @@ -319,8 +334,8 @@ def test_roc_auc_ovo(self): automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) def test_roc_auc_ovr_weighted(self): - automl_experiment = AutoML() - automl_settings = { + automl = AutoML() + settings = { "time_budget": 1, "metric": "roc_auc_ovr_weighted", "task": "classification", @@ -330,7 +345,7 @@ def test_roc_auc_ovr_weighted(self): "model_history": True, } X_train, y_train = load_iris(return_X_y=True) - automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) + automl.fit(X_train=X_train, y_train=y_train, **settings) def test_roc_auc_ovo_weighted(self): automl_experiment = AutoML() @@ -415,10 +430,10 @@ def test_time_limit(self): automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) print(automl_experiment.model) - def test_fit_w_starting_point(self, as_frame=True): - automl_experiment = AutoML() - automl_settings = { - "time_budget": 3, + def test_fit_w_starting_point(self, as_frame=True, n_concurrent_trials=1): + automl = AutoML() + settings = { + "max_iter": 3, "metric": "accuracy", "task": "classification", "log_file_name": "test/iris.log", @@ -431,21 +446,26 @@ def test_fit_w_starting_point(self, as_frame=True): # test drop column X_train.columns = range(X_train.shape[1]) X_train[X_train.shape[1]] = np.zeros(len(y_train)) - automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) - automl_val_accuracy = 1.0 - automl_experiment.best_loss - print("Best ML leaner:", automl_experiment.best_estimator) - print("Best hyperparmeter config:", automl_experiment.best_config) + automl.fit( + X_train=X_train, + y_train=y_train, + n_concurrent_trials=n_concurrent_trials, + **settings + ) + automl_val_accuracy = 1.0 - automl.best_loss + print("Best ML leaner:", automl.best_estimator) + print("Best hyperparmeter config:", automl.best_config) print("Best accuracy on validation data: 
{0:.4g}".format(automl_val_accuracy)) print( "Training duration of best run: {0:.4g} s".format( - automl_experiment.best_config_train_time + automl.best_config_train_time ) ) - starting_points = automl_experiment.best_config_per_estimator + starting_points = automl.best_config_per_estimator print("starting_points", starting_points) - print("loss of the starting_points", automl_experiment.best_loss_per_estimator) - automl_settings_resume = { + print("loss of the starting_points", automl.best_loss_per_estimator) + settings_resume = { "time_budget": 2, "metric": "accuracy", "task": "classification", @@ -456,27 +476,34 @@ def test_fit_w_starting_point(self, as_frame=True): "log_type": "all", "starting_points": starting_points, } - new_automl_experiment = AutoML() - new_automl_experiment.fit( - X_train=X_train, y_train=y_train, **automl_settings_resume - ) + new_automl = AutoML() + new_automl.fit(X_train=X_train, y_train=y_train, **settings_resume) - new_automl_val_accuracy = 1.0 - new_automl_experiment.best_loss - print("Best ML leaner:", new_automl_experiment.best_estimator) - print("Best hyperparmeter config:", new_automl_experiment.best_config) + new_automl_val_accuracy = 1.0 - new_automl.best_loss + print("Best ML leaner:", new_automl.best_estimator) + print("Best hyperparmeter config:", new_automl.best_config) print( "Best accuracy on validation data: {0:.4g}".format(new_automl_val_accuracy) ) print( "Training duration of best run: {0:.4g} s".format( - new_automl_experiment.best_config_train_time + new_automl.best_config_train_time ) ) - def test_fit_w_starting_points_list(self, as_frame=True): - automl_experiment = AutoML() - automl_settings = { - "time_budget": 3, + def test_fit_w_starting_point_2(self, as_frame=True): + try: + import ray + + self.test_fit_w_starting_points_list(as_frame, 2) + self.test_fit_w_starting_point(as_frame, 2) + except ImportError: + pass + + def test_fit_w_starting_points_list(self, as_frame=True, n_concurrent_trials=1): + automl = 
AutoML() + settings = { + "max_iter": 3, "metric": "accuracy", "task": "classification", "log_file_name": "test/iris.log", @@ -489,19 +516,24 @@ def test_fit_w_starting_points_list(self, as_frame=True): # test drop column X_train.columns = range(X_train.shape[1]) X_train[X_train.shape[1]] = np.zeros(len(y_train)) - automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) - automl_val_accuracy = 1.0 - automl_experiment.best_loss - print("Best ML leaner:", automl_experiment.best_estimator) - print("Best hyperparmeter config:", automl_experiment.best_config) + automl.fit( + X_train=X_train, + y_train=y_train, + n_concurrent_trials=n_concurrent_trials, + **settings + ) + automl_val_accuracy = 1.0 - automl.best_loss + print("Best ML leaner:", automl.best_estimator) + print("Best hyperparmeter config:", automl.best_config) print("Best accuracy on validation data: {0:.4g}".format(automl_val_accuracy)) print( "Training duration of best run: {0:.4g} s".format( - automl_experiment.best_config_train_time + automl.best_config_train_time ) ) starting_points = {} - log_file_name = automl_settings["log_file_name"] + log_file_name = settings["log_file_name"] with training_log_reader(log_file_name) as reader: sample_size = 1000 for record in reader.records(): @@ -513,7 +545,7 @@ def test_fit_w_starting_points_list(self, as_frame=True): starting_points[learner] = [] starting_points[learner].append(config) max_iter = sum([len(s) for k, s in starting_points.items()]) - automl_settings_resume = { + settings_resume = { "time_budget": 2, "metric": "accuracy", "task": "classification", @@ -526,14 +558,12 @@ def test_fit_w_starting_points_list(self, as_frame=True): "starting_points": starting_points, "append_log": True, } - new_automl_experiment = AutoML() - new_automl_experiment.fit( - X_train=X_train, y_train=y_train, **automl_settings_resume - ) + new_automl = AutoML() + new_automl.fit(X_train=X_train, y_train=y_train, **settings_resume) - new_automl_val_accuracy = 1.0 
- new_automl_experiment.best_loss - # print('Best ML leaner:', new_automl_experiment.best_estimator) - # print('Best hyperparmeter config:', new_automl_experiment.best_config) + new_automl_val_accuracy = 1.0 - new_automl.best_loss + # print('Best ML leaner:', new_automl.best_estimator) + # print('Best hyperparmeter config:', new_automl.best_config) print( "Best accuracy on validation data: {0:.4g}".format(new_automl_val_accuracy) ) diff --git a/test/automl/test_python_log.py b/test/automl/test_python_log.py index 3681fcd455..5a1e9ec153 100644 --- a/test/automl/test_python_log.py +++ b/test/automl/test_python_log.py @@ -96,7 +96,7 @@ def test_logging_level(self): ) print(min(trial.last_result["val_loss"] for trial in analysis.trials)) config = analysis.trials[-1].last_result["config"]["ml"] - automl._state._train_with_config(config["learner"], config) + automl._state._train_with_config(config.pop("learner"), config) for _ in range(3): print( search_alg._ls.complete_config( diff --git a/test/automl/test_training_log.py b/test/automl/test_training_log.py index ff1b426735..d8949e6d12 100644 --- a/test/automl/test_training_log.py +++ b/test/automl/test_training_log.py @@ -40,7 +40,9 @@ def test_training_log( if automl.best_estimator: estimator, config = automl.best_estimator, automl.best_config model0 = automl.best_model_for_estimator(estimator) - print(model0.params["n_estimators"], config) + print(model0.params) + if "n_estimators" in config: + assert model0.params["n_estimators"] == config["n_estimators"] # train on full data with no time limit automl._state.time_budget = -1 diff --git a/test/spark/__init__.py b/test/spark/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/spark/custom_mylearner.py b/test/spark/custom_mylearner.py new file mode 100644 index 0000000000..0ccc159ca9 --- /dev/null +++ b/test/spark/custom_mylearner.py @@ -0,0 +1,124 @@ +from flaml.tune.spark.utils import broadcast_code + +custom_code = """ +from flaml import 
tune +from flaml.automl.model import LGBMEstimator, XGBoostSklearnEstimator, SKLearnEstimator +from flaml.automl.data import CLASSIFICATION, get_output_from_log + +class MyRegularizedGreedyForest(SKLearnEstimator): + def __init__(self, task="binary", **config): + + super().__init__(task, **config) + + if task in CLASSIFICATION: + from rgf.sklearn import RGFClassifier + + self.estimator_class = RGFClassifier + else: + from rgf.sklearn import RGFRegressor + + self.estimator_class = RGFRegressor + + @classmethod + def search_space(cls, data_size, task): + space = { + "max_leaf": { + "domain": tune.lograndint(lower=4, upper=data_size[0]), + "init_value": 4, + }, + "n_iter": { + "domain": tune.lograndint(lower=1, upper=data_size[0]), + "init_value": 1, + }, + "n_tree_search": { + "domain": tune.lograndint(lower=1, upper=32768), + "init_value": 1, + }, + "opt_interval": { + "domain": tune.lograndint(lower=1, upper=10000), + "init_value": 100, + }, + "learning_rate": {"domain": tune.loguniform(lower=0.01, upper=20.0)}, + "min_samples_leaf": { + "domain": tune.lograndint(lower=1, upper=20), + "init_value": 20, + }, + } + return space + + @classmethod + def size(cls, config): + max_leaves = int(round(config.get("max_leaf", 1))) + n_estimators = int(round(config.get("n_iter", 1))) + return (max_leaves * 3 + (max_leaves - 1) * 4 + 1.0) * n_estimators * 8 + + @classmethod + def cost_relative2lgbm(cls): + return 1.0 + + +class MyLargeXGB(XGBoostSklearnEstimator): + @classmethod + def search_space(cls, **params): + return { + "n_estimators": { + "domain": tune.lograndint(lower=4, upper=32768), + "init_value": 32768, + "low_cost_init_value": 4, + }, + "max_leaves": { + "domain": tune.lograndint(lower=4, upper=3276), + "init_value": 3276, + "low_cost_init_value": 4, + }, + } + + +class MyLargeLGBM(LGBMEstimator): + @classmethod + def search_space(cls, **params): + return { + "n_estimators": { + "domain": tune.lograndint(lower=4, upper=32768), + "init_value": 32768, + 
"low_cost_init_value": 4, + }, + "num_leaves": { + "domain": tune.lograndint(lower=4, upper=3276), + "init_value": 3276, + "low_cost_init_value": 4, + }, + } + + +def custom_metric( + X_val, + y_val, + estimator, + labels, + X_train, + y_train, + weight_val=None, + weight_train=None, + config=None, + groups_val=None, + groups_train=None, +): + from sklearn.metrics import log_loss + import time + + start = time.time() + y_pred = estimator.predict_proba(X_val) + pred_time = (time.time() - start) / len(X_val) + val_loss = log_loss(y_val, y_pred, labels=labels, sample_weight=weight_val) + y_pred = estimator.predict_proba(X_train) + train_loss = log_loss(y_train, y_pred, labels=labels, sample_weight=weight_train) + alpha = 0.5 + return val_loss * (1 + alpha) - alpha * train_loss, { + "val_loss": val_loss, + "train_loss": train_loss, + "pred_time": pred_time, + } +""" + +_ = broadcast_code(custom_code=custom_code) diff --git a/test/spark/mylearner.py b/test/spark/mylearner.py new file mode 100644 index 0000000000..980e371eea --- /dev/null +++ b/test/spark/mylearner.py @@ -0,0 +1,19 @@ +from flaml.automl.model import LGBMEstimator +from flaml import tune + + +class MyLargeLGBM(LGBMEstimator): + @classmethod + def search_space(cls, **params): + return { + "n_estimators": { + "domain": tune.lograndint(lower=4, upper=32768), + "init_value": 32768, + "low_cost_init_value": 4, + }, + "num_leaves": { + "domain": tune.lograndint(lower=4, upper=32768), + "init_value": 32768, + "low_cost_init_value": 4, + }, + } diff --git a/test/spark/test_automl.py b/test/spark/test_automl.py new file mode 100644 index 0000000000..79801c79e6 --- /dev/null +++ b/test/spark/test_automl.py @@ -0,0 +1,108 @@ +import numpy as np +import scipy.sparse +from flaml import AutoML +from flaml.tune.spark.utils import check_spark +import os +import pytest + +# For spark, we need to put customized learner in a separate file +if os.path.exists(os.path.join(os.getcwd(), "test", "spark", "mylearner.py")): + try: 
+ from test.spark.mylearner import MyLargeLGBM + + skip_my_learner = False + except ImportError: + skip_my_learner = True + MyLargeLGBM = None +else: + MyLargeLGBM = None + skip_my_learner = True + +os.environ["FLAML_MAX_CONCURRENT"] = "2" + +spark_available, _ = check_spark() +skip_spark = not spark_available + +pytestmark = pytest.mark.skipif( + skip_spark, reason="Spark is not installed. Skip all spark tests." +) + + +def test_parallel_xgboost(hpo_method=None, data_size=1000): + automl_experiment = AutoML() + automl_settings = { + "time_budget": 10, + "metric": "ap", + "task": "classification", + "log_file_name": "test/sparse_classification.log", + "estimator_list": ["xgboost"], + "log_type": "all", + "n_jobs": 1, + "n_concurrent_trials": 2, + "hpo_method": hpo_method, + "use_spark": True, + } + X_train = scipy.sparse.eye(data_size) + y_train = np.random.randint(2, size=data_size) + + automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) + print(automl_experiment.predict(X_train)) + print(automl_experiment.model) + print(automl_experiment.config_history) + print(automl_experiment.best_model_for_estimator("xgboost")) + print(automl_experiment.best_iteration) + print(automl_experiment.best_estimator) + + +def test_parallel_xgboost_others(): + # use random search as the hpo_method + test_parallel_xgboost(hpo_method="random") + + +@pytest.mark.skip( + reason="currently not supporting too large data, will support spark dataframe in the future" +) +def test_large_dataset(): + test_parallel_xgboost(data_size=90000000) + + +@pytest.mark.skipif( + skip_my_learner, + reason="please run pytest in the root directory of FLAML, i.e., the directory that contains the setup.py file", +) +def test_custom_learner(data_size=1000): + automl_experiment = AutoML() + automl_experiment.add_learner(learner_name="large_lgbm", learner_class=MyLargeLGBM) + automl_settings = { + "time_budget": 2, + "task": "classification", + "log_file_name": 
"test/sparse_classification_oom.log", + "estimator_list": ["large_lgbm"], + "log_type": "all", + "n_jobs": 1, + "hpo_method": "random", + "n_concurrent_trials": 2, + "use_spark": True, + } + X_train = scipy.sparse.eye(data_size) + y_train = np.random.randint(2, size=data_size) + + automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) + print(automl_experiment.predict(X_train)) + print(automl_experiment.model) + print(automl_experiment.config_history) + print(automl_experiment.best_model_for_estimator("large_lgbm")) + print(automl_experiment.best_iteration) + print(automl_experiment.best_estimator) + + +if __name__ == "__main__": + test_parallel_xgboost() + test_parallel_xgboost_others() + # test_large_dataset() + if skip_my_learner: + print( + "please run pytest in the root directory of FLAML, i.e., the directory that contains the setup.py file" + ) + else: + test_custom_learner() diff --git a/test/spark/test_ensemble.py b/test/spark/test_ensemble.py new file mode 100644 index 0000000000..42199c267b --- /dev/null +++ b/test/spark/test_ensemble.py @@ -0,0 +1,57 @@ +import unittest +from sklearn.datasets import load_wine +from flaml import AutoML +from flaml.tune.spark.utils import check_spark +import os + +spark_available, _ = check_spark() +skip_spark = not spark_available + +os.environ["FLAML_MAX_CONCURRENT"] = "2" + +# To solve pylint issue, we put code for customizing mylearner in a separate file +if os.path.exists(os.path.join(os.getcwd(), "test", "spark", "custom_mylearner.py")): + try: + from test.spark.custom_mylearner import * + from flaml.tune.spark.mylearner import MyRegularizedGreedyForest + + skip_my_learner = False + except ImportError: + skip_my_learner = True +else: + skip_my_learner = True + + +class TestEnsemble(unittest.TestCase): + def setUp(self) -> None: + if skip_spark: + self.skipTest("Spark is not installed. 
Skip all spark tests.") + + @unittest.skipIf( + skip_my_learner, + "Please run pytest in the root directory of FLAML, i.e., the directory that contains the setup.py file", + ) + def test_ensemble(self): + automl = AutoML() + automl.add_learner(learner_name="RGF", learner_class=MyRegularizedGreedyForest) + X_train, y_train = load_wine(return_X_y=True) + settings = { + "time_budget": 5, # total running time in seconds + "estimator_list": ["rf", "xgboost", "catboost"], + "task": "classification", # task type + "sample": True, # whether to subsample training data + "log_file_name": "test/wine.log", + "log_training_metric": True, # whether to log training metric + "ensemble": { + "final_estimator": MyRegularizedGreedyForest(), + "passthrough": False, + }, + "n_jobs": 1, + "n_concurrent_trials": 2, + "use_spark": True, + } + automl.fit(X_train=X_train, y_train=y_train, **settings) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/spark/test_exceptions.py b/test/spark/test_exceptions.py new file mode 100644 index 0000000000..0c31b8aded --- /dev/null +++ b/test/spark/test_exceptions.py @@ -0,0 +1,76 @@ +from flaml.automl.data import load_openml_dataset +from flaml import AutoML +from flaml.tune.spark.utils import check_spark +import os +import pytest + +spark_available, _ = check_spark() +skip_spark = not spark_available + +pytestmark = pytest.mark.skipif( + skip_spark, reason="Spark is not installed. Skip all spark tests." 
+) + +os.environ["FLAML_MAX_CONCURRENT"] = "2" + + +def base_automl(n_concurrent_trials=1, use_ray=False, use_spark=False, verbose=0): + X_train, X_test, y_train, y_test = load_openml_dataset( + dataset_id=537, data_dir="./" + ) + automl = AutoML() + settings = { + "time_budget": 3, # total running time in seconds + "metric": "r2", # primary metrics for regression can be chosen from: ['mae','mse','r2','rmse','mape'] + "estimator_list": ["lgbm", "rf", "xgboost"], # list of ML learners + "task": "regression", # task type + "log_file_name": "houses_experiment.log", # flaml log file + "seed": 7654321, # random seed + "n_concurrent_trials": n_concurrent_trials, # the maximum number of concurrent learners + "use_ray": use_ray, # whether to use Ray for distributed training + "use_spark": use_spark, # whether to use Spark for distributed training + "verbose": verbose, + } + + automl.fit(X_train=X_train, y_train=y_train, **settings) + + print("Best ML leaner:", automl.best_estimator) + print("Best hyperparmeter config:", automl.best_config) + print("Best accuracy on validation data: {0:.4g}".format(1 - automl.best_loss)) + print( + "Training duration of best run: {0:.4g} s".format(automl.best_config_train_time) + ) + + +def test_both_ray_spark(): + with pytest.raises(ValueError): + base_automl(n_concurrent_trials=2, use_ray=True, use_spark=True) + + +def test_verboses(): + for verbose in [1, 3, 5]: + base_automl(verbose=verbose) + + +def test_import_error(): + from importlib import reload + import flaml.tune.spark.utils as utils + + reload(utils) + utils._have_spark = False + spark_available, spark_error_msg = utils.check_spark() + assert not spark_available + assert isinstance(spark_error_msg, ImportError) + + reload(utils) + utils._spark_major_minor_version = (1, 1) + spark_available, spark_error_msg = utils.check_spark() + assert not spark_available + assert isinstance(spark_error_msg, ImportError) + + reload(utils) + + +if __name__ == "__main__": + base_automl() + 
test_import_error() diff --git a/test/spark/test_multiclass.py b/test/spark/test_multiclass.py new file mode 100644 index 0000000000..9a2a3950a0 --- /dev/null +++ b/test/spark/test_multiclass.py @@ -0,0 +1,470 @@ +import unittest +import numpy as np +import scipy.sparse +from sklearn.datasets import load_iris, load_wine +from flaml import AutoML +from flaml.automl.data import CLASSIFICATION, get_output_from_log +from flaml.automl.training_log import training_log_reader +from flaml.tune.spark.utils import check_spark +import os + +spark_available, _ = check_spark() +skip_spark = not spark_available + +os.environ["FLAML_MAX_CONCURRENT"] = "2" + +# To solve pylint issue, we put code for customizing mylearner in a separate file +if os.path.exists(os.path.join(os.getcwd(), "test", "spark", "custom_mylearner.py")): + try: + from test.spark.custom_mylearner import * + from flaml.tune.spark.mylearner import ( + MyRegularizedGreedyForest, + custom_metric, + MyLargeLGBM, + MyLargeXGB, + ) + + skip_my_learner = False + except ImportError: + skip_my_learner = True +else: + skip_my_learner = True + + +class TestMultiClass(unittest.TestCase): + def setUp(self) -> None: + if skip_spark: + self.skipTest("Spark is not installed. 
Skip all spark tests.") + + @unittest.skipIf( + skip_my_learner, + "Please run pytest in the root directory of FLAML, i.e., the directory that contains the setup.py file", + ) + def test_custom_learner(self): + automl = AutoML() + automl.add_learner(learner_name="RGF", learner_class=MyRegularizedGreedyForest) + X_train, y_train = load_wine(return_X_y=True) + settings = { + "time_budget": 8, # total running time in seconds + "estimator_list": ["RGF", "lgbm", "rf", "xgboost"], + "task": "classification", # task type + "sample": True, # whether to subsample training data + "log_file_name": "test/wine.log", + "log_training_metric": True, # whether to log training metric + "n_jobs": 1, + "n_concurrent_trials": 2, + "use_spark": True, + "verbose": 4, + } + automl.fit(X_train=X_train, y_train=y_train, **settings) + # print the best model found for RGF + print(automl.best_model_for_estimator("RGF")) + + MyRegularizedGreedyForest.search_space = lambda data_size, task: {} + automl.fit(X_train=X_train, y_train=y_train, **settings) + + @unittest.skipIf( + skip_my_learner, + "Please run pytest in the root directory of FLAML, i.e., the directory that contains the setup.py file", + ) + def test_custom_metric(self): + df, y = load_iris(return_X_y=True, as_frame=True) + df["label"] = y + automl_experiment = AutoML() + automl_settings = { + "dataframe": df, + "label": "label", + "time_budget": 5, + "eval_method": "cv", + "metric": custom_metric, + "task": "classification", + "log_file_name": "test/iris_custom.log", + "log_training_metric": True, + "log_type": "all", + "n_jobs": 1, + "model_history": True, + "sample_weight": np.ones(len(y)), + "pred_time_limit": 1e-5, + # "ensemble": True, + "n_concurrent_trials": 2, + "use_spark": True, + } + automl_experiment.fit(**automl_settings) + print(automl_experiment.classes_) + print(automl_experiment.model) + print(automl_experiment.config_history) + print(automl_experiment.best_model_for_estimator("rf")) + 
print(automl_experiment.best_iteration) + print(automl_experiment.best_estimator) + automl_experiment = AutoML() + estimator = automl_experiment.get_estimator_from_log( + automl_settings["log_file_name"], record_id=0, task="multiclass" + ) + print(estimator) + ( + time_history, + best_valid_loss_history, + valid_loss_history, + config_history, + metric_history, + ) = get_output_from_log( + filename=automl_settings["log_file_name"], time_budget=6 + ) + print(metric_history) + + def test_classification(self, as_frame=False): + automl_experiment = AutoML() + automl_settings = { + "time_budget": 4, + "metric": "accuracy", + "task": "classification", + "log_file_name": "test/iris.log", + "log_training_metric": True, + "n_jobs": 1, + "model_history": True, + "n_concurrent_trials": 2, + "use_spark": True, + } + X_train, y_train = load_iris(return_X_y=True, as_frame=as_frame) + if as_frame: + # test drop column + X_train.columns = range(X_train.shape[1]) + X_train[X_train.shape[1]] = np.zeros(len(y_train)) + automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) + print(automl_experiment.classes_) + print(automl_experiment.predict(X_train)[:5]) + print(automl_experiment.model) + print(automl_experiment.config_history) + print(automl_experiment.best_model_for_estimator("catboost")) + print(automl_experiment.best_iteration) + print(automl_experiment.best_estimator) + del automl_settings["metric"] + del automl_settings["model_history"] + del automl_settings["log_training_metric"] + automl_experiment = AutoML(task="classification") + duration = automl_experiment.retrain_from_log( + log_file_name=automl_settings["log_file_name"], + X_train=X_train, + y_train=y_train, + train_full=True, + record_id=0, + ) + print(duration) + print(automl_experiment.model) + print(automl_experiment.predict_proba(X_train)[:5]) + + def test_micro_macro_f1(self): + automl_experiment_micro = AutoML() + automl_experiment_macro = AutoML() + automl_settings = { + "time_budget": 2, + 
"task": "classification", + "log_file_name": "test/micro_macro_f1.log", + "log_training_metric": True, + "n_jobs": 1, + "model_history": True, + "n_concurrent_trials": 2, + "use_spark": True, + } + X_train, y_train = load_iris(return_X_y=True) + automl_experiment_micro.fit( + X_train=X_train, y_train=y_train, metric="micro_f1", **automl_settings + ) + automl_experiment_macro.fit( + X_train=X_train, y_train=y_train, metric="macro_f1", **automl_settings + ) + estimator = automl_experiment_macro.model + y_pred = estimator.predict(X_train) + y_pred_proba = estimator.predict_proba(X_train) + from flaml.automl.ml import norm_confusion_matrix, multi_class_curves + + print(norm_confusion_matrix(y_train, y_pred)) + from sklearn.metrics import roc_curve, precision_recall_curve + + print(multi_class_curves(y_train, y_pred_proba, roc_curve)) + print(multi_class_curves(y_train, y_pred_proba, precision_recall_curve)) + + def test_roc_auc_ovr(self): + automl_experiment = AutoML() + X_train, y_train = load_iris(return_X_y=True) + automl_settings = { + "time_budget": 1, + "metric": "roc_auc_ovr", + "task": "classification", + "log_file_name": "test/roc_auc_ovr.log", + "log_training_metric": True, + "n_jobs": 1, + "sample_weight": np.ones(len(y_train)), + "eval_method": "holdout", + "model_history": True, + "n_concurrent_trials": 2, + "use_spark": True, + } + automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) + + def test_roc_auc_ovo(self): + automl_experiment = AutoML() + automl_settings = { + "time_budget": 1, + "metric": "roc_auc_ovo", + "task": "classification", + "log_file_name": "test/roc_auc_ovo.log", + "log_training_metric": True, + "n_jobs": 1, + "model_history": True, + "n_concurrent_trials": 2, + "use_spark": True, + } + X_train, y_train = load_iris(return_X_y=True) + automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) + + def test_roc_auc_ovr_weighted(self): + automl_experiment = AutoML() + automl_settings = { + 
"time_budget": 1, + "metric": "roc_auc_ovr_weighted", + "task": "classification", + "log_file_name": "test/roc_auc_weighted.log", + "log_training_metric": True, + "n_jobs": 1, + "model_history": True, + "n_concurrent_trials": 2, + "use_spark": True, + } + X_train, y_train = load_iris(return_X_y=True) + automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) + + def test_roc_auc_ovo_weighted(self): + automl_experiment = AutoML() + automl_settings = { + "time_budget": 1, + "metric": "roc_auc_ovo_weighted", + "task": "classification", + "log_file_name": "test/roc_auc_weighted.log", + "log_training_metric": True, + "n_jobs": 1, + "model_history": True, + "n_concurrent_trials": 2, + "use_spark": True, + } + X_train, y_train = load_iris(return_X_y=True) + automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) + + def test_sparse_matrix_classification(self): + automl_experiment = AutoML() + automl_settings = { + "time_budget": 2, + "metric": "auto", + "task": "classification", + "log_file_name": "test/sparse_classification.log", + "split_type": "uniform", + "n_jobs": 1, + "model_history": True, + "n_concurrent_trials": 2, + "use_spark": True, + } + X_train = scipy.sparse.random(1554, 21, dtype=int) + y_train = np.random.randint(3, size=1554) + automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) + print(automl_experiment.classes_) + print(automl_experiment.predict_proba(X_train)) + print(automl_experiment.model) + print(automl_experiment.config_history) + print(automl_experiment.best_model_for_estimator("extra_tree")) + print(automl_experiment.best_iteration) + print(automl_experiment.best_estimator) + + @unittest.skipIf( + skip_my_learner, + "Please run pytest in the root directory of FLAML, i.e., the directory that contains the setup.py file", + ) + def _test_memory_limit(self): + automl_experiment = AutoML() + automl_experiment.add_learner( + learner_name="large_lgbm", learner_class=MyLargeLGBM + ) + 
automl_settings = { + "time_budget": -1, + "task": "classification", + "log_file_name": "test/classification_oom.log", + "estimator_list": ["large_lgbm"], + "log_type": "all", + "hpo_method": "random", + "free_mem_ratio": 0.2, + "n_concurrent_trials": 2, + "use_spark": True, + } + X_train, y_train = load_iris(return_X_y=True, as_frame=True) + + automl_experiment.fit( + X_train=X_train, y_train=y_train, max_iter=1, **automl_settings + ) + print(automl_experiment.model) + + @unittest.skipIf( + skip_my_learner, + "Please run pytest in the root directory of FLAML, i.e., the directory that contains the setup.py file", + ) + def test_time_limit(self): + automl_experiment = AutoML() + automl_experiment.add_learner( + learner_name="large_lgbm", learner_class=MyLargeLGBM + ) + automl_experiment.add_learner( + learner_name="large_xgb", learner_class=MyLargeXGB + ) + automl_settings = { + "time_budget": 0.5, + "task": "classification", + "log_file_name": "test/classification_timeout.log", + "estimator_list": ["catboost"], + "log_type": "all", + "hpo_method": "random", + "n_concurrent_trials": 2, + "use_spark": True, + } + X_train, y_train = load_iris(return_X_y=True, as_frame=True) + automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) + print(automl_experiment.model.params) + automl_settings["estimator_list"] = ["large_xgb"] + automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) + print(automl_experiment.model) + automl_settings["estimator_list"] = ["large_lgbm"] + automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) + print(automl_experiment.model) + + def test_fit_w_starting_point(self, as_frame=True): + automl_experiment = AutoML() + automl_settings = { + "time_budget": 3, + "metric": "accuracy", + "task": "classification", + "log_file_name": "test/iris.log", + "log_training_metric": True, + "n_jobs": 1, + "model_history": True, + "n_concurrent_trials": 2, + "use_spark": True, + } + X_train, y_train = 
load_iris(return_X_y=True, as_frame=as_frame) + if as_frame: + # test drop column + X_train.columns = range(X_train.shape[1]) + X_train[X_train.shape[1]] = np.zeros(len(y_train)) + automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) + automl_val_accuracy = 1.0 - automl_experiment.best_loss + print("Best ML leaner:", automl_experiment.best_estimator) + print("Best hyperparmeter config:", automl_experiment.best_config) + print("Best accuracy on validation data: {0:.4g}".format(automl_val_accuracy)) + print( + "Training duration of best run: {0:.4g} s".format( + automl_experiment.best_config_train_time + ) + ) + + starting_points = automl_experiment.best_config_per_estimator + print("starting_points", starting_points) + print("loss of the starting_points", automl_experiment.best_loss_per_estimator) + automl_settings_resume = { + "time_budget": 2, + "metric": "accuracy", + "task": "classification", + "log_file_name": "test/iris_resume.log", + "log_training_metric": True, + "n_jobs": 1, + "model_history": True, + "log_type": "all", + "starting_points": starting_points, + "n_concurrent_trials": 2, + "use_spark": True, + } + new_automl_experiment = AutoML() + new_automl_experiment.fit( + X_train=X_train, y_train=y_train, **automl_settings_resume + ) + + new_automl_val_accuracy = 1.0 - new_automl_experiment.best_loss + print("Best ML leaner:", new_automl_experiment.best_estimator) + print("Best hyperparmeter config:", new_automl_experiment.best_config) + print( + "Best accuracy on validation data: {0:.4g}".format(new_automl_val_accuracy) + ) + print( + "Training duration of best run: {0:.4g} s".format( + new_automl_experiment.best_config_train_time + ) + ) + + def test_fit_w_starting_points_list(self, as_frame=True): + automl_experiment = AutoML() + automl_settings = { + "time_budget": 3, + "metric": "accuracy", + "task": "classification", + "log_file_name": "test/iris.log", + "log_training_metric": True, + "n_jobs": 1, + "model_history": True, + 
"n_concurrent_trials": 2, + "use_spark": True, + } + X_train, y_train = load_iris(return_X_y=True, as_frame=as_frame) + if as_frame: + # test drop column + X_train.columns = range(X_train.shape[1]) + X_train[X_train.shape[1]] = np.zeros(len(y_train)) + automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) + automl_val_accuracy = 1.0 - automl_experiment.best_loss + print("Best ML leaner:", automl_experiment.best_estimator) + print("Best hyperparmeter config:", automl_experiment.best_config) + print("Best accuracy on validation data: {0:.4g}".format(automl_val_accuracy)) + print( + "Training duration of best run: {0:.4g} s".format( + automl_experiment.best_config_train_time + ) + ) + + starting_points = {} + log_file_name = automl_settings["log_file_name"] + with training_log_reader(log_file_name) as reader: + sample_size = 1000 + for record in reader.records(): + config = record.config + config["FLAML_sample_size"] = sample_size + sample_size += 1000 + learner = record.learner + if learner not in starting_points: + starting_points[learner] = [] + starting_points[learner].append(config) + max_iter = sum([len(s) for k, s in starting_points.items()]) + automl_settings_resume = { + "time_budget": 2, + "metric": "accuracy", + "task": "classification", + "log_file_name": "test/iris_resume_all.log", + "log_training_metric": True, + "n_jobs": 1, + "max_iter": max_iter, + "model_history": True, + "log_type": "all", + "starting_points": starting_points, + "append_log": True, + "n_concurrent_trials": 2, + "use_spark": True, + } + new_automl_experiment = AutoML() + new_automl_experiment.fit( + X_train=X_train, y_train=y_train, **automl_settings_resume + ) + + new_automl_val_accuracy = 1.0 - new_automl_experiment.best_loss + # print('Best ML leaner:', new_automl_experiment.best_estimator) + # print('Best hyperparmeter config:', new_automl_experiment.best_config) + print( + "Best accuracy on validation data: {0:.4g}".format(new_automl_val_accuracy) + ) + # 
print('Training duration of best run: {0:.4g} s'.format(new_automl_experiment.best_config_train_time)) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/spark/test_notebook.py b/test/spark/test_notebook.py new file mode 100644 index 0000000000..d900d76f70 --- /dev/null +++ b/test/spark/test_notebook.py @@ -0,0 +1,41 @@ +import nbformat +from nbconvert.preprocessors import ExecutePreprocessor +from nbconvert.preprocessors import CellExecutionError +from flaml.tune.spark.utils import check_spark +import os +import pytest + +spark_available, _ = check_spark() +skip_spark = not spark_available + +pytestmark = pytest.mark.skipif( + skip_spark, reason="Spark is not installed. Skip all spark tests." +) + +here = os.path.abspath(os.path.dirname(__file__)) +os.environ["FLAML_MAX_CONCURRENT"] = "2" + + +def run_notebook(input_nb, output_nb="executed_notebook.ipynb", save=False): + try: + file_path = os.path.join(here, os.pardir, os.pardir, "notebook", input_nb) + with open(file_path) as f: + nb = nbformat.read(f, as_version=4) + ep = ExecutePreprocessor(timeout=600, kernel_name="python3") + ep.preprocess(nb, {"metadata": {"path": here}}) + except CellExecutionError: + raise + except Exception as e: + print("\nIgnoring below error:\n", e, "\n\n") + finally: + if save: + with open(os.path.join(here, output_nb), "w", encoding="utf-8") as f: + nbformat.write(nb, f) + + +def test_automl_lightgbm_test(): + run_notebook("integrate_spark.ipynb") + + +if __name__ == "__main__": + test_automl_lightgbm_test() diff --git a/test/spark/test_performance.py b/test/spark/test_performance.py new file mode 100644 index 0000000000..2bf72b9454 --- /dev/null +++ b/test/spark/test_performance.py @@ -0,0 +1,110 @@ +import sys +from openml.exceptions import OpenMLServerException +from requests.exceptions import ChunkedEncodingError, SSLError +from flaml.tune.spark.utils import check_spark +import os +import pytest + +spark_available, _ = check_spark() +skip_spark = not 
spark_available + +pytestmark = pytest.mark.skipif( + skip_spark, reason="Spark is not installed. Skip all spark tests." +) + +os.environ["FLAML_MAX_CONCURRENT"] = "2" + + +def run_automl(budget=3, dataset_format="dataframe", hpo_method=None): + from flaml.automl.data import load_openml_dataset + import urllib3 + + performance_check_budget = 3600 + if sys.platform == "darwin" or "nt" in os.name or "3.10" not in sys.version: + budget = 3 # revise the buget if the platform is not linux + python 3.10 + if budget >= performance_check_budget: + max_iter = 60 + performance_check_budget = None + else: + max_iter = None + try: + X_train, X_test, y_train, y_test = load_openml_dataset( + dataset_id=1169, data_dir="test/", dataset_format=dataset_format + ) + except ( + OpenMLServerException, + ChunkedEncodingError, + urllib3.exceptions.ReadTimeoutError, + SSLError, + ) as e: + print(e) + return + + """ import AutoML class from flaml package """ + from flaml import AutoML + + automl = AutoML() + settings = { + "time_budget": budget, # total running time in seconds + "max_iter": max_iter, # maximum number of iterations + "metric": "accuracy", # primary metrics can be chosen from: ['accuracy','roc_auc','roc_auc_ovr','roc_auc_ovo','f1','log_loss','mae','mse','r2'] + "task": "classification", # task type + "log_file_name": "airlines_experiment.log", # flaml log file + "seed": 7654321, # random seed + "hpo_method": hpo_method, + "log_type": "all", + "estimator_list": [ + "lgbm", + "xgboost", + "xgb_limitdepth", + "rf", + "extra_tree", + ], # list of ML learners + "eval_method": "holdout", + "n_concurrent_trials": 2, + "use_spark": True, + } + + """The main flaml automl API""" + automl.fit(X_train=X_train, y_train=y_train, **settings) + + """ retrieve best config and best learner """ + print("Best ML leaner:", automl.best_estimator) + print("Best hyperparmeter config:", automl.best_config) + print("Best accuracy on validation data: {0:.4g}".format(1 - automl.best_loss)) + print( + 
"Training duration of best run: {0:.4g} s".format(automl.best_config_train_time) + ) + print(automl.model.estimator) + print(automl.best_config_per_estimator) + print("time taken to find best model:", automl.time_to_find_best_model) + + """ compute predictions of testing dataset """ + y_pred = automl.predict(X_test) + print("Predicted labels", y_pred) + print("True labels", y_test) + y_pred_proba = automl.predict_proba(X_test)[:, 1] + """ compute different metric values on testing dataset """ + from flaml.automl.ml import sklearn_metric_loss_score + + accuracy = 1 - sklearn_metric_loss_score("accuracy", y_pred, y_test) + print("accuracy", "=", accuracy) + print( + "roc_auc", "=", 1 - sklearn_metric_loss_score("roc_auc", y_pred_proba, y_test) + ) + print("log_loss", "=", sklearn_metric_loss_score("log_loss", y_pred_proba, y_test)) + if performance_check_budget is None: + assert accuracy >= 0.669, "the accuracy of flaml should be larger than 0.67" + + +def test_automl_array(): + run_automl(3, "array", "bs") + + +def test_automl_performance(): + run_automl(3600) + + +if __name__ == "__main__": + test_automl_array() + test_automl_performance() diff --git a/test/spark/test_tune.py b/test/spark/test_tune.py new file mode 100644 index 0000000000..bbd482c821 --- /dev/null +++ b/test/spark/test_tune.py @@ -0,0 +1,58 @@ +import lightgbm as lgb +import numpy as np +from sklearn.datasets import load_breast_cancer +from sklearn.metrics import accuracy_score +from sklearn.model_selection import train_test_split +from flaml import tune +from flaml.automl.model import LGBMEstimator +from flaml.tune.spark.utils import check_spark +import os +import pytest + +spark_available, _ = check_spark() +skip_spark = not spark_available + +pytestmark = pytest.mark.skipif( + skip_spark, reason="Spark is not installed. Skip all spark tests." 
+) + +os.environ["FLAML_MAX_CONCURRENT"] = "2" +X, y = load_breast_cancer(return_X_y=True) +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25) + + +def train_breast_cancer(config): + params = LGBMEstimator(**config).params + train_set = lgb.Dataset(X_train, label=y_train) + gbm = lgb.train(params, train_set) + preds = gbm.predict(X_test) + pred_labels = np.rint(preds) + result = { + "mean_accuracy": accuracy_score(y_test, pred_labels), + } + return result + + +def test_tune_spark(): + flaml_lgbm_search_space = LGBMEstimator.search_space(X_train.shape) + config_search_space = { + hp: space["domain"] for hp, space in flaml_lgbm_search_space.items() + } + + analysis = tune.run( + train_breast_cancer, + metric="mean_accuracy", + mode="max", + config=config_search_space, + num_samples=-1, + time_budget_s=5, + use_spark=True, + verbose=3, + ) + + # print("Best hyperparameters found were: ", analysis.best_config) + print("The best trial's result: ", analysis.best_trial.last_result) + + +if __name__ == "__main__": + test_tune_spark() diff --git a/test/spark/test_utils.py b/test/spark/test_utils.py new file mode 100644 index 0000000000..757f458b54 --- /dev/null +++ b/test/spark/test_utils.py @@ -0,0 +1,101 @@ +from flaml.tune.spark.utils import ( + with_parameters, + check_spark, + get_n_cpus, + get_broadcast_data, +) +from functools import partial +from timeit import timeit +import pytest + +try: + from pyspark.sql import SparkSession + import pyspark + + spark_available, _ = check_spark() + skip_spark = not spark_available +except ImportError: + print("Spark is not installed. Skip all spark tests.") + skip_spark = True + +pytestmark = pytest.mark.skipif( + skip_spark, reason="Spark is not installed. Skip all spark tests." 
+) + + +def test_with_parameters_spark(): + def train(config, data=None): + if isinstance(data, pyspark.broadcast.Broadcast): + data = data.value + print(config, len(data)) + + data = ["a"] * 10**6 + + with_parameters_train = with_parameters(train, data=data) + partial_train = partial(train, data=data) + + spark = SparkSession.builder.getOrCreate() + rdd = spark.sparkContext.parallelize(list(range(2))) + + t_partial = timeit( + lambda: rdd.map(lambda x: partial_train(config=x)).collect(), number=5 + ) + print("python_partial_train: " + str(t_partial)) + + t_spark = timeit( + lambda: rdd.map(lambda x: with_parameters_train(config=x)).collect(), + number=5, + ) + print("spark_with_parameters_train: " + str(t_spark)) + + # assert t_spark < t_partial + + +def test_get_n_cpus_spark(): + n_cpus = get_n_cpus() + assert isinstance(n_cpus, int) + + +def test_broadcast_code(): + from flaml.tune.spark.utils import broadcast_code + from flaml.automl.model import LGBMEstimator + + custom_code = """ + from flaml.automl.model import LGBMEstimator + from flaml import tune + + class MyLargeLGBM(LGBMEstimator): + @classmethod + def search_space(cls, **params): + return { + "n_estimators": { + "domain": tune.lograndint(lower=4, upper=32768), + "init_value": 32768, + "low_cost_init_value": 4, + }, + "num_leaves": { + "domain": tune.lograndint(lower=4, upper=32768), + "init_value": 32768, + "low_cost_init_value": 4, + }, + } + """ + + _ = broadcast_code(custom_code=custom_code) + from flaml.tune.spark.mylearner import MyLargeLGBM + + assert isinstance(MyLargeLGBM(), LGBMEstimator) + + +def test_get_broadcast_data(): + data = ["a"] * 10 + spark = SparkSession.builder.getOrCreate() + bc_data = spark.sparkContext.broadcast(data) + assert get_broadcast_data(bc_data) == data + + +if __name__ == "__main__": + test_with_parameters_spark() + test_get_n_cpus_spark() + test_broadcast_code() + test_get_broadcast_data() diff --git a/test/tune/test_pytorch_cifar10.py 
b/test/tune/test_pytorch_cifar10.py index 2151bf281d..188d9750fb 100644 --- a/test/tune/test_pytorch_cifar10.py +++ b/test/tune/test_pytorch_cifar10.py @@ -313,7 +313,11 @@ def cifar10_main( best_trained_model = nn.DataParallel(best_trained_model) best_trained_model.to(device) - checkpoint_path = os.path.join(best_trial.checkpoint.value, "checkpoint") + checkpoint_value = ( + getattr(best_trial.checkpoint, "dir_or_data", None) + or best_trial.checkpoint.value + ) + checkpoint_path = os.path.join(checkpoint_value, "checkpoint") model_state, optimizer_state = torch.load(checkpoint_path) best_trained_model.load_state_dict(model_state) diff --git a/test/tune/test_searcher.py b/test/tune/test_searcher.py index 440c08ab0b..ff29b522cf 100644 --- a/test/tune/test_searcher.py +++ b/test/tune/test_searcher.py @@ -194,8 +194,8 @@ def test_searcher(): searcher.on_trial_complete("t2", None, True) searcher.suggest("t3") searcher.on_trial_complete("t3", {"m": np.nan}) - searcher.save("test/tune/optuna.pickle") - searcher.restore("test/tune/optuna.pickle") + searcher.save("test/tune/optuna.pkl") + searcher.restore("test/tune/optuna.pkl") try: searcher = BlendSearch( metric="m", global_search_alg=searcher, metric_constraints=[("c", "<", 1)] diff --git a/website/docs/Examples/Tune-Lexicographic-objectives.md b/website/docs/Examples/Tune-Lexicographic-objectives.md index c7fff54631..b215c37282 100644 --- a/website/docs/Examples/Tune-Lexicographic-objectives.md +++ b/website/docs/Examples/Tune-Lexicographic-objectives.md @@ -5,6 +5,7 @@ ```python pip install "flaml>=1.1.0" thop torchvision torch ``` +Tuning multiple objectives with Lexicographic preference is a new feature added in version 1.1.0 and is subject to change in future versions. 
## Tuning accurate and efficient neural networks with lexicographic preference @@ -162,4 +163,4 @@ analysis = tune.run( ``` -[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/tune_lexicographic.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/tune_lexicographic.ipynb) \ No newline at end of file +[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/tune_lexicographic.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/tune_lexicographic.ipynb) diff --git a/website/docs/Examples/Tune-PyTorch.md b/website/docs/Examples/Tune-PyTorch.md index 83f38e6098..d75c716c7f 100644 --- a/website/docs/Examples/Tune-PyTorch.md +++ b/website/docs/Examples/Tune-PyTorch.md @@ -261,7 +261,8 @@ if torch.cuda.is_available(): best_trained_model = nn.DataParallel(best_trained_model) best_trained_model.to(device) -checkpoint_path = os.path.join(best_trial.checkpoint.value, "checkpoint") +checkpoint_value = getattr(best_trial.checkpoint, "dir_or_data", None) or best_trial.checkpoint.value +checkpoint_path = os.path.join(checkpoint_value, "checkpoint") model_state, optimizer_state = torch.load(checkpoint_path) best_trained_model.load_state_dict(model_state) @@ -283,4 +284,4 @@ Files already downloaded and verified Best trial test set accuracy: 0.6294 ``` -[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/tune_pytorch.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/tune_pytorch.ipynb) \ No newline at end of file +[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/tune_pytorch.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/tune_pytorch.ipynb) diff --git a/website/docs/FAQ.md b/website/docs/FAQ.md index 2fdbcd2fd2..232e390c05 100644 --- a/website/docs/FAQ.md +++ b/website/docs/FAQ.md @@ -66,3 +66,16 @@ 
Packages such as `azureml-interpret` and `sklearn.inspection.permutation_importa Model explanation is frequently asked and adding a native support may be a good feature. Suggestions/contributions are welcome. Optimization history can be checked from the [log](Use-Cases/Task-Oriented-AutoML#log-the-trials). You can also [retrieve the log and plot the learning curve](Use-Cases/Task-Oriented-AutoML#plot-learning-curve). + + +### How to resolve out-of-memory error in `AutoML.fit()` + +* Set `free_mem_ratio` a float between 0 and 1. For example, 0.2 means try to keep free memory above 20% of total memory. Training may be early stopped for memory consumption reason when this is set. +* Set `model_history` False. +* If your data are already preprocessed, set `skip_transform` False. If you can preprocess the data before the fit starts, this setting can save memory needed for preprocessing in `fit`. +* If the OOM error only happens for some particular trials: + - set `use_ray` True. This will increase the overhead per trial but can keep the AutoML process running when a single trial fails due to OOM error. + - provide a more accurate [`size`](reference/automl/model#size) function for the memory bytes consumption of each config for the estimator causing this error. + - modify the [search space](Use-Cases/Task-Oriented-AutoML#a-shortcut-to-override-the-search-space) for the estimators causing this error. + - or remove this estimator from the `estimator_list`. +* If the OOM error happens when ensembling, consider disabling ensemble, or use a cheaper ensemble option. ([Example](Use-Cases/Task-Oriented-AutoML#ensemble)). 
diff --git a/website/docs/Installation.md b/website/docs/Installation.md index 7cc37943a1..76ad85c561 100644 --- a/website/docs/Installation.md +++ b/website/docs/Installation.md @@ -50,6 +50,28 @@ pip install flaml[nlp] ```bash pip install flaml[ray] ``` +* spark +> *Spark support is added in v1.1.0* +```bash +pip install flaml[spark]>=1.1.0 +``` + +For cloud platforms such as [Azure Synapse](https://azure.microsoft.com/en-us/products/synapse-analytics/), Spark clusters are provided. +But you may also need to install `Spark` manually when setting up your own environment. +For latest Ubuntu system, you can install Spark 3.3.0 standalone version with below script. +For more details of installing Spark, please refer to [Spark Doc](https://spark.apache.org/docs/latest/api/python/getting_started/install.html). +```bash +sudo apt-get update && sudo apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \ + ca-certificates-java ca-certificates openjdk-17-jdk-headless \ + && sudo apt-get clean && sudo rm -rf /var/lib/apt/lists/* +wget --progress=dot:giga "https://www.apache.org/dyn/closer.lua/spark/spark-3.3.0/spark-3.3.0-bin-hadoop2.tgz?action=download" \ + -O - | tar -xzC /tmp; archive=$(basename "spark-3.3.0/spark-3.3.0-bin-hadoop2.tgz") \ + bash -c "sudo mv -v /tmp/\${archive/%.tgz/} /spark" +export SPARK_HOME=/spark +export PYTHONPATH=/spark/python/lib/py4j-0.10.9.5-src.zip:/spark/python +export PATH=$PATH:$SPARK_HOME/bin +``` + * nni ```bash pip install flaml[nni] diff --git a/website/docs/Use-Cases/Task-Oriented-AutoML.md b/website/docs/Use-Cases/Task-Oriented-AutoML.md index 94025e57e8..1ec2522526 100644 --- a/website/docs/Use-Cases/Task-Oriented-AutoML.md +++ b/website/docs/Use-Cases/Task-Oriented-AutoML.md @@ -382,7 +382,11 @@ and have ``split`` and ``get_n_splits`` methods with the same signatures. 
To di When you have parallel resources, you can either spend them in training and keep the model search sequential, or perform parallel search. Following scikit-learn, the parameter `n_jobs` specifies how many CPU cores to use for each training job. The number of parallel trials is specified via the parameter `n_concurrent_trials`. By default, `n_jobs=-1, n_concurrent_trials=1`. That is, all the CPU cores (in a single compute node) are used for training a single model and the search is sequential. When you have more resources than what each single training job needs, you can consider increasing `n_concurrent_trials`. -To do parallel tuning, install the `ray` and `blendsearch` options: +FLAML now support two backends for parallel tuning, i.e., `Ray` and `Spark`. You can use either of them, but not both for one tuning job. + +#### Parallel tuning with Ray + +To do parallel tuning with Ray, install the `ray` and `blendsearch` options: ```bash pip install flaml[ray,blendsearch] ``` @@ -397,6 +401,23 @@ automl.fit(X_train, y_train, n_jobs=4, n_concurrent_trials=4) ``` flaml will perform 4 trials in parallel, each consuming 4 CPU cores. The parallel tuning uses the [BlendSearch](Tune-User-Defined-Function##blendsearch-economical-hyperparameter-optimization-with-blended-search-strategy) algorithm. +#### Parallel tuning with Spark + +To do parallel tuning with Spark, install the `spark` and `blendsearch` options: + +> *Spark support is added in v1.1.0* +```bash +pip install flaml[spark,blendsearch]>=1.1.0 +``` + +For more details about installing Spark, please refer to [Installation](../Installation#Distributed-tuning). + +An example of using Spark for parallel tuning is: +```python +automl.fit(X_train, y_train, n_concurrent_trials=4, use_spark=True) +``` +For Spark clusters, by default, we will launch one trial per executor. However, sometimes we want to launch more trials than the number of executors (e.g., local mode). 
In this case, we can set the environment variable `FLAML_MAX_CONCURRENT` to override the detected `num_executors`. The final number of concurrent trials will be the minimum of `n_concurrent_trials` and `num_executors`. Also, GPU training is not supported yet when use_spark is True. + #### **Guidelines on parallel vs sequential tuning** **(1) Considerations on wall-clock time.** diff --git a/website/docs/Use-Cases/Tune-User-Defined-Function.md b/website/docs/Use-Cases/Tune-User-Defined-Function.md index 39f04eb863..858d2d07c7 100644 --- a/website/docs/Use-Cases/Tune-User-Defined-Function.md +++ b/website/docs/Use-Cases/Tune-User-Defined-Function.md @@ -100,14 +100,14 @@ If it is a numerical hyperparameter, you need to know whether it takes integer v ```python { -"learning_rate": tune.loguniform(lower=1 / 1024, upper=1.0), + "learning_rate": tune.loguniform(lower=1 / 1024, upper=1.0), } ``` When the search range of learning rate is small, it is more common to sample in the linear scale as shown in the following example, ```python { -"learning_rate": tune.uniform(lower=0.1, upper=0.2), + "learning_rate": tune.uniform(lower=0.1, upper=0.2), } ``` @@ -117,7 +117,7 @@ When the search range of learning rate is small, it is more common to sample in When you have a desired quantization granularity for the hyperparameter change, you can use `tune.qlograndint` or `tune.qloguniform` to realize the quantization requirement. The following code example helps you realize the need for sampling uniformly in the range of 0.1 and 0.2 with increments of 0.02, i.e., the sampled learning rate can only take values in {0.1, 0.12, 0.14, 0.16, ..., 0.2}, ```python { -"learning_rate": tune.uniform(lower=0.1, upper=0.2, q=0.02), + "learning_rate": tune.quniform(lower=0.1, upper=0.2, q=0.02), } ``` @@ -290,10 +290,13 @@ The key difference between these two types of constraints is that the calculatio Related arguments: - `use_ray`: A boolean of whether to use ray as the backend. 
+- `use_spark`: A boolean of whether to use spark as the backend. - `resources_per_trial`: A dictionary of the hardware resources to allocate per trial, e.g., `{'cpu': 1}`. Only valid when using ray backend. -You can perform parallel tuning by specifying `use_ray=True` (requiring flaml[ray] option installed). You can also limit the amount of resources allocated per trial by specifying `resources_per_trial`, e.g., `resources_per_trial={'cpu': 2}`. +You can perform parallel tuning by specifying `use_ray=True` (requiring flaml[ray] option installed) or `use_spark=True` +(requiring flaml[spark] option installed). You can also limit the amount of resources allocated per trial by specifying `resources_per_trial`, +e.g., `resources_per_trial={'cpu': 2}` when `use_ray=True`. ```python # require: pip install flaml[ray] @@ -311,6 +314,21 @@ print(analysis.best_trial.last_result) # the best trial's result print(analysis.best_config) # the best config ``` +```python +# require: pip install flaml[spark] +analysis = tune.run( + evaluate_config, # the function to evaluate a config + config=config_search_space, # the search space defined + metric="score", + mode="min", # the optimization mode, "min" or "max" + num_samples=-1, # the maximal number of configs to try, -1 means infinite + time_budget_s=10, # the time budget in seconds + use_spark=True, +) +print(analysis.best_trial.last_result) # the best trial's result +print(analysis.best_config) # the best config +``` + **A headsup about computation overhead.** When parallel tuning is used, there will be a certain amount of computation overhead in each trial. In case each trial's original cost is much smaller than the overhead, parallel tuning can underperform sequential tuning. Sequential tuning is recommended when compute resource is limited, and each trial can consume all the resources. 
@@ -529,7 +547,7 @@ In the following example, we want to minimize `val_loss` and `pred_time` of the ```python lexico_objectives = {} lexico_objectives["metrics"] = ["val_loss", "pred_time"] -lexico_objectives["pred_time"] = ["min", "min"] +lexico_objectives["modes"] = ["min", "min"] lexico_objectives["tolerances"] = {"val_loss": 0.02, "pred_time": 0.0} lexico_objectives["targets"] = {"val_loss": -float('inf'), "pred_time": -float('inf')} From 7e95240498a3ec4850197ed0685bc12e41a32ee7 Mon Sep 17 00:00:00 2001 From: Qingyun Wu Date: Fri, 23 Dec 2022 22:33:16 -0500 Subject: [PATCH 16/17] Update .pre-commit-config.yaml --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index dd2c4a71f3..436112d4cf 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,4 +18,4 @@ repos: - id: check-merge-conflict - id: detect-private-key - id: trailing-whitespace - # - id: no-commit-to-branch \ No newline at end of file + - id: no-commit-to-branch From 229ffc7994404caa64e208f7464020770635036a Mon Sep 17 00:00:00 2001 From: Qingyun Wu Date: Fri, 23 Dec 2022 19:51:35 -0800 Subject: [PATCH 17/17] revert --- .github/workflows/python-package.yml | 11 +- .pre-commit-config.yaml | 2 +- Dockerfile | 10 - README.md | 4 +- flaml/automl/automl.py | 305 ++++-------- flaml/tune/searcher/blendsearch.py | 11 +- flaml/tune/searcher/flow2.py | 2 +- flaml/tune/spark/__init__.py | 8 - flaml/tune/spark/utils.py | 191 ------- flaml/tune/trial_runner.py | 38 -- flaml/tune/tune.py | 178 +------ notebook/automl_lightgbm.ipynb | 6 +- notebook/integrate_azureml.ipynb | 6 +- notebook/integrate_spark.ipynb | 1 - notebook/tune_pytorch.ipynb | 19 +- setup.py | 8 - test/automl/__init__.py | 0 test/automl/test_multiclass.py | 148 +++--- test/automl/test_python_log.py | 2 +- test/automl/test_training_log.py | 4 +- test/spark/__init__.py | 0 test/spark/custom_mylearner.py | 124 ----- test/spark/mylearner.py | 
19 - test/spark/test_automl.py | 108 ---- test/spark/test_ensemble.py | 57 --- test/spark/test_exceptions.py | 76 --- test/spark/test_multiclass.py | 470 ------------------ test/spark/test_notebook.py | 41 -- test/spark/test_performance.py | 110 ---- test/spark/test_tune.py | 58 --- test/spark/test_utils.py | 101 ---- test/tune/test_pytorch_cifar10.py | 6 +- test/tune/test_searcher.py | 4 +- .../Examples/Tune-Lexicographic-objectives.md | 3 +- website/docs/Examples/Tune-PyTorch.md | 5 +- website/docs/FAQ.md | 13 - website/docs/Installation.md | 22 - .../docs/Use-Cases/Task-Oriented-AutoML.md | 23 +- .../Use-Cases/Tune-User-Defined-Function.md | 28 +- 39 files changed, 204 insertions(+), 2018 deletions(-) delete mode 100644 flaml/tune/spark/__init__.py delete mode 100644 flaml/tune/spark/utils.py delete mode 100644 notebook/integrate_spark.ipynb delete mode 100644 test/automl/__init__.py delete mode 100644 test/spark/__init__.py delete mode 100644 test/spark/custom_mylearner.py delete mode 100644 test/spark/mylearner.py delete mode 100644 test/spark/test_automl.py delete mode 100644 test/spark/test_ensemble.py delete mode 100644 test/spark/test_exceptions.py delete mode 100644 test/spark/test_multiclass.py delete mode 100644 test/spark/test_notebook.py delete mode 100644 test/spark/test_performance.py delete mode 100644 test/spark/test_tune.py delete mode 100644 test/spark/test_utils.py diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index dfc784a058..51fcc40d79 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -16,7 +16,7 @@ jobs: strategy: matrix: os: [ubuntu-latest, macos-latest, windows-2019] - python-version: ["3.7", "3.8", "3.9", "3.10"] + python-version: ["3.6", "3.7", "3.8", "3.9", "3.10"] steps: - uses: actions/checkout@v3 @@ -37,15 +37,6 @@ jobs: export CFLAGS="$CFLAGS -I/usr/local/opt/libomp/include" export CXXFLAGS="$CXXFLAGS -I/usr/local/opt/libomp/include" export 
LDFLAGS="$LDFLAGS -Wl,-rpath,/usr/local/opt/libomp/lib -L/usr/local/opt/libomp/lib -lomp" - - name: On Linux, install Spark stand-alone cluster and PySpark - if: matrix.os == 'ubuntu-latest' - run: | - sudo apt-get update && sudo apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends ca-certificates-java ca-certificates openjdk-17-jdk-headless && sudo apt-get clean && sudo rm -rf /var/lib/apt/lists/* - wget --progress=dot:giga "https://www.apache.org/dyn/closer.lua/spark/spark-3.3.0/spark-3.3.0-bin-hadoop2.tgz?action=download" -O - | tar -xzC /tmp; archive=$(basename "spark-3.3.0/spark-3.3.0-bin-hadoop2.tgz") bash -c "sudo mv -v /tmp/\${archive/%.tgz/} /spark" - pip install --no-cache-dir pyspark>=3.0 - export SPARK_HOME=/spark - export PYTHONPATH=/spark/python/lib/py4j-0.10.9.5-src.zip:/spark/python - export PATH=$PATH:$SPARK_HOME/bin - name: Install packages and dependencies run: | python -m pip install --upgrade pip wheel diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 436112d4cf..dd2c4a71f3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,4 +18,4 @@ repos: - id: check-merge-conflict - id: detect-private-key - id: trailing-whitespace - - id: no-commit-to-branch + # - id: no-commit-to-branch \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 4f0a63aa89..bd358f2316 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,16 +3,6 @@ FROM python:3.7 RUN apt-get update && apt-get -y update RUN apt-get install -y sudo git npm -# Install Spark -RUN sudo apt-get update && sudo apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \ - ca-certificates-java ca-certificates openjdk-17-jdk-headless \ - wget \ - && sudo apt-get clean && sudo rm -rf /var/lib/apt/lists/* -RUN wget --progress=dot:giga "https://www.apache.org/dyn/closer.lua/spark/spark-3.3.0/spark-3.3.0-bin-hadoop2.tgz?action=download" -O - | tar -xzC /tmp; archive=$(basename 
"spark-3.3.0/spark-3.3.0-bin-hadoop2.tgz") bash -c "sudo mv -v /tmp/\${archive/%.tgz/} /spark" -ENV SPARK_HOME=/spark \ - PYTHONPATH=/spark/python/lib/py4j-0.10.9.5-src.zip:/spark/python -ENV PATH="${PATH}:${SPARK_HOME}/bin" - # Setup user to not run as root RUN adduser --disabled-password --gecos '' flaml-dev RUN adduser flaml-dev sudo diff --git a/README.md b/README.md index 71362b0239..163ca56550 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ Use the following guides to get started with FLAML in .NET: - [Install Model Builder](https://docs.microsoft.com/dotnet/machine-learning/how-to-guides/install-model-builder?tabs=visual-studio-2022) - [Install ML.NET CLI](https://docs.microsoft.com/dotnet/machine-learning/how-to-guides/install-ml-net-cli?tabs=windows) -- [Microsoft.AutoML](https://www.nuget.org/packages/Microsoft.ML.AutoML/0.20.0) +- [Microsoft.AutoML](https://www.nuget.org/packages/Microsoft.ML.AutoML/0.20.0-preview.22313.1) ## Quickstart @@ -107,7 +107,7 @@ In addition, you can find: - Contributing guide [here](https://microsoft.github.io/FLAML/docs/Contribute). -- ML.NET documentation and tutorials for [Model Builder](https://learn.microsoft.com/dotnet/machine-learning/tutorials/predict-prices-with-model-builder), [ML.NET CLI](https://learn.microsoft.com/dotnet/machine-learning/tutorials/sentiment-analysis-cli), and [AutoML API](https://learn.microsoft.com/dotnet/machine-learning/how-to-guides/how-to-use-the-automl-api). +- ML.NET documentation and tutorials for [Model Builder](https://docs.microsoft.com/dotnet/machine-learning/tutorials/predict-prices-with-model-builder), [ML.NET CLI](https://docs.microsoft.com/en-us/dotnet/machine-learning/tutorials/sentiment-analysis-cli), and [AutoML API](https://github.com/dotnet/csharp-notebooks/blob/main/machine-learning/03-Training%20and%20AutoML.ipynb). 
## Contributing diff --git a/flaml/automl/automl.py b/flaml/automl/automl.py index 72fea04d75..7e50aa2122 100644 --- a/flaml/automl/automl.py +++ b/flaml/automl/automl.py @@ -4,7 +4,6 @@ # * project root for license information. import time import os -import sys from typing import Callable, Optional, List, Union, Any import inspect from functools import partial @@ -55,28 +54,17 @@ from flaml.automl.training_log import training_log_reader, training_log_writer from flaml.default import suggest_learner from flaml.version import __version__ as flaml_version -from flaml.tune.spark.utils import check_spark, get_broadcast_data logger = logging.getLogger(__name__) logger_formatter = logging.Formatter( "[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s", "%m-%d %H:%M:%S" ) -logger.propagate = False try: import mlflow except ImportError: mlflow = None -try: - from ray import __version__ as ray_version - - assert ray_version >= "1.10.0" - - ray_available = True -except (ImportError, AssertionError): - ray_available = False - class SearchState: @property @@ -153,34 +141,31 @@ def __init__( if custom_hp is not None: search_space.update(custom_hp) - if isinstance(starting_point, dict): - starting_point = AutoMLState.sanitize(starting_point) - if max_iter > 1 and not self.valid_starting_point( - starting_point, search_space - ): - # If the number of iterations is larger than 1, remove invalid point + if ( + isinstance(starting_point, dict) + and max_iter + > 1 # If the number of starting point is larger than max iter, avoid the checking + and not self.valid_starting_point(starting_point, search_space) + ): + logger.warning( + "Starting point {} removed because it is outside of the search space".format( + starting_point + ) + ) + starting_point = None + elif isinstance(starting_point, list) and max_iter > len( + starting_point + ): # If the number of starting point is larger than max iter, avoid the checking + starting_point_len = len(starting_point) + 
starting_point = [ + x for x in starting_point if self.valid_starting_point(x, search_space) + ] + if starting_point_len > len(starting_point): logger.warning( - "Starting point {} removed because it is outside of the search space".format( - starting_point - ) + "Starting points outside of the search space are removed. " + f"Remaining starting points for {learner_class}: {starting_point}" ) - starting_point = None - elif isinstance(starting_point, list): - starting_point = [AutoMLState.sanitize(x) for x in starting_point] - if max_iter > len(starting_point): - # If the number of starting points is no smaller than max iter, avoid the checking - starting_point_len = len(starting_point) - starting_point = [ - x - for x in starting_point - if self.valid_starting_point(x, search_space) - ] - if starting_point_len > len(starting_point): - logger.warning( - "Starting points outside of the search space are removed. " - f"Remaining starting points for {learner_class}: {starting_point}" - ) - starting_point = starting_point or None + starting_point = starting_point or None for name, space in search_space.items(): assert ( @@ -253,10 +238,7 @@ def update(self, result, time_used): and trained_estimator.params.get(trained_estimator.ITER_HP) ) if n_iter: - if "ml" in config: - config["ml"][trained_estimator.ITER_HP] = n_iter - else: - config[trained_estimator.ITER_HP] = n_iter + config[trained_estimator.ITER_HP] = n_iter else: obj, time2eval, trained_estimator = np.inf, 0.0, None metric_for_logging = config = None @@ -343,7 +325,7 @@ def _prepare_sample_train_data(self, sample_size): return sampled_X_train, sampled_y_train, sampled_weight, groups @staticmethod - def _compute_with_config_base(config_w_resource, state, estimator, is_report=True): + def _compute_with_config_base(config_w_resource, state, estimator): if "FLAML_sample_size" in config_w_resource: sample_size = int(config_w_resource["FLAML_sample_size"]) else: @@ -419,17 +401,16 @@ def 
_compute_with_config_base(config_w_resource, state, estimator, is_report=Tru } if sampled_weight is not None: this_estimator_kwargs["sample_weight"] = weight - if is_report is True: - tune.report(**result) + tune.report(**result) return result - @classmethod - def sanitize(cls, config: dict) -> dict: + def sanitize(self, config: dict) -> dict: """Make a config ready for passing to estimator.""" config = config.get("ml", config).copy() - config.pop("FLAML_sample_size", None) - config.pop("learner", None) - config.pop("_choice_", None) + if "FLAML_sample_size" in config: + del config["FLAML_sample_size"] + if "learner" in config: + del config["learner"] return config def _train_with_config( @@ -442,7 +423,7 @@ def _train_with_config( sample_size = config_w_resource.get( "FLAML_sample_size", len(self.y_train_all) ) - config = AutoMLState.sanitize(config_w_resource) + config = self.sanitize(config_w_resource) this_estimator_kwargs = self.fit_kwargs_by_estimator.get( estimator @@ -661,10 +642,7 @@ def custom_metric( n_concurrent_trials: [Experimental] int, default=1 | The number of concurrent trials. When n_concurrent_trials > 1, flaml performes [parallel tuning](../../Use-Cases/Task-Oriented-AutoML#parallel-tuning) - and installation of ray or spark is required: `pip install flaml[ray]` - or `pip install flaml[spark]`. Please check - [here](https://spark.apache.org/docs/latest/api/python/getting_started/install.html) - for more details about installing Spark. + and installation of ray is required: `pip install flaml[ray]`. keep_search_state: boolean, default=False | Whether to keep data needed for model search after fit(). By default the state is deleted for space saving. @@ -684,15 +662,6 @@ def custom_metric( datasets, but will incur more overhead in time. If dict: the dict contains the keywords arguments to be passed to [ray.tune.run](https://docs.ray.io/en/latest/tune/api_docs/execution.html). 
- use_spark: boolean, default=False | Whether to use spark to run the training - in parallel spark jobs. This can be used to accelerate training on large models - and large datasets, but will incur more overhead in time and thus slow down - training in some cases. GPU training is not supported yet when use_spark is True. - For Spark clusters, by default, we will launch one trial per executor. However, - sometimes we want to launch more trials than the number of executors (e.g., local mode). - In this case, we can set the environment variable `FLAML_MAX_CONCURRENT` to override - the detected `num_executors`. The final number of concurrent trials will be the minimum - of `n_concurrent_trials` and `num_executors`. free_mem_ratio: float between 0 and 1, default=0. The free memory ratio to keep during training. metric_constraints: list, default=[] | The list of metric constraints. Each element in this list is a 3-tuple, which shall be expressed @@ -784,9 +753,6 @@ def custom_metric( settings["append_log"] = settings.get("append_log", False) settings["min_sample_size"] = settings.get("min_sample_size", MIN_SAMPLE_TRAIN) settings["use_ray"] = settings.get("use_ray", False) - settings["use_spark"] = settings.get("use_spark", False) - if settings["use_ray"] is not False and settings["use_spark"] is not False: - raise ValueError("use_ray and use_spark cannot be both True.") settings["free_mem_ratio"] = settings.get("free_mem_ratio", 0) settings["metric_constraints"] = settings.get("metric_constraints", []) settings["cv_score_agg_func"] = settings.get("cv_score_agg_func", None) @@ -848,15 +814,13 @@ def best_iteration(self): def best_config(self): """A dictionary of the best configuration.""" state = self._search_states.get(self._best_estimator) - config = state and getattr(state, "best_config", None) - return config and AutoMLState.sanitize(config) + return state and getattr(state, "best_config", None) @property def best_config_per_estimator(self): """A dictionary of all 
estimators' best configuration.""" return { e: e_search_state.best_config - and AutoMLState.sanitize(e_search_state.best_config) for e, e_search_state in self._search_states.items() } @@ -1605,7 +1569,7 @@ def get_estimator_from_log(self, log_file_name, record_id, task): with training_log_reader(log_file_name) as reader: record = reader.get_record(record_id) estimator = record.learner - config = AutoMLState.sanitize(record.config) + config = record.config estimator, _ = train_estimator( X_train=None, @@ -2109,10 +2073,8 @@ def trainable(self) -> Callable[[dict], Optional[float]]: states = self._search_states mem_res = self._mem_thres - def train(config: dict, state, is_report=True): - # handle spark broadcast variables - state = get_broadcast_data(state) - is_report = get_broadcast_data(is_report) + def train(config: dict, state): + sample_size = config.get("FLAML_sample_size") config = config.get("ml", config).copy() if sample_size: @@ -2121,9 +2083,8 @@ def train(config: dict, state, is_report=True): # check memory constraints before training if states[estimator].learner_class.size(config) <= mem_res: del config["learner"] - config.pop("_choice_", None) result = AutoMLState._compute_with_config_base( - config, state=state, estimator=estimator, is_report=is_report + config, state=state, estimator=estimator ) else: # If search algorithm is not in flaml, it does not handle the config constraint, should also tune.report before return @@ -2134,8 +2095,7 @@ def train(config: dict, state, is_report=True): "val_loss": np.inf, "trained_estimator": None, } - if is_report is True: - tune.report(**result) + tune.report(**result) return result if self._use_ray is not False: @@ -2145,10 +2105,6 @@ def train(config: dict, state, is_report=True): train, state=self._state, ) - elif self._use_spark: - from flaml.tune.spark.utils import with_parameters - - return with_parameters(train, state=self._state, is_report=False) else: return partial( train, @@ -2209,7 +2165,6 @@ def fit( 
auto_augment=None, min_sample_size=None, use_ray=None, - use_spark=None, free_mem_ratio=0, metric_constraints=None, custom_hp=None, @@ -2383,10 +2338,7 @@ def custom_metric( n_concurrent_trials: [Experimental] int, default=1 | The number of concurrent trials. When n_concurrent_trials > 1, flaml performes [parallel tuning](../../Use-Cases/Task-Oriented-AutoML#parallel-tuning) - and installation of ray or spark is required: `pip install flaml[ray]` - or `pip install flaml[spark]`. Please check - [here](https://spark.apache.org/docs/latest/api/python/getting_started/install.html) - for more details about installing Spark. + and installation of ray is required: `pip install flaml[ray]`. keep_search_state: boolean, default=False | Whether to keep data needed for model search after fit(). By default the state is deleted for space saving. @@ -2406,10 +2358,6 @@ def custom_metric( datasets, but will incur more overhead in time. If dict: the dict contains the keywords arguments to be passed to [ray.tune.run](https://docs.ray.io/en/latest/tune/api_docs/execution.html). - use_spark: boolean, default=False | Whether to use spark to run the training - in parallel spark jobs. This can be used to accelerate training on large models - and large datasets, but will incur more overhead in time and thus slow down - training in some cases. free_mem_ratio: float between 0 and 1, default=0. The free memory ratio to keep during training. metric_constraints: list, default=[] | The list of metric constraints. 
Each element in this list is a 3-tuple, which shall be expressed @@ -2603,50 +2551,12 @@ def cv_score_agg_func(val_loss_folds, log_metrics_folds): ) min_sample_size = min_sample_size or self._settings.get("min_sample_size") use_ray = self._settings.get("use_ray") if use_ray is None else use_ray - use_spark = self._settings.get("use_spark") if use_spark is None else use_spark - spark_available, spark_error_msg = check_spark() - if use_spark and use_ray is not False: - raise ValueError("use_spark and use_ray cannot be both True.") - elif use_spark and not spark_available: - raise spark_error_msg - - old_level = logger.getEffectiveLevel() - self.verbose = verbose - logger.setLevel(50 - verbose * 10) - if not logger.handlers: - # Add the console handler. - _ch = logging.StreamHandler(stream=sys.stdout) - _ch.setFormatter(logger_formatter) - logger.addHandler(_ch) - - if not use_ray and not use_spark and n_concurrent_trials > 1: - if ray_available: - logger.warning( - "n_concurrent_trials > 1 is only supported when using Ray or Spark. " - "Ray installed, setting use_ray to True. If you want to use Spark, set use_spark to True." - ) - use_ray = True - elif spark_available: - logger.warning( - "n_concurrent_trials > 1 is only supported when using Ray or Spark. " - "Spark installed, setting use_spark to True. If you want to use Ray, set use_ray to True." - ) - use_spark = True - else: - logger.warning( - "n_concurrent_trials > 1 is only supported when using Ray or Spark. " - "Neither Ray nor Spark installed, setting n_concurrent_trials to 1." 
- ) - n_concurrent_trials = 1 - self._state.n_jobs = n_jobs self._n_concurrent_trials = n_concurrent_trials self._early_stop = early_stop - self._use_spark = use_spark - self._use_ray = use_ray + self._use_ray = use_ray or n_concurrent_trials > 1 # use the following condition if we have an estimation of average_trial_time and average_trial_overhead - # self._use_ray = use_ray or n_concurrent_trials > ( average_trial_time + average_trial_overhead) / (average_trial_time) - + # self._use_ray = use_ray or n_concurrent_trials > ( average_trail_time + average_trial_overhead) / (average_trial_time) if self._use_ray is not False: import ray @@ -2674,11 +2584,6 @@ def cv_score_agg_func(val_loss_folds, log_metrics_folds): X_train = ray.get(X_train) elif isinstance(dataframe, ray.ObjectRef): dataframe = ray.get(dataframe) - else: - # TODO: Integrate with Spark - self._state.resources_per_trial = ( - {"cpu": n_jobs} if n_jobs > 0 else {"cpu": 1} - ) self._state.free_mem_ratio = ( self._settings.get("free_mem_ratio") if free_mem_ratio is None @@ -2709,6 +2614,14 @@ def cv_score_agg_func(val_loss_folds, log_metrics_folds): self._random = np.random.RandomState(RANDOM_SEED) self._seed = seed if seed is not None else 20 self._learner_selector = learner_selector + old_level = logger.getEffectiveLevel() + self.verbose = verbose + logger.setLevel(50 - verbose * 10) + if not logger.handlers: + # Add the console handler. + _ch = logging.StreamHandler() + _ch.setFormatter(logger_formatter) + logger.addHandler(_ch) logger.info(f"task = {task}") self._decide_split_type(split_type) logger.info(f"Data split method: {self._split_type}") @@ -2894,7 +2807,6 @@ def is_to_reverse_metric(metric, task): logger.warning( "No search budget is provided via time_budget or max_iter." " Training only one model per estimator." - " Zero-shot AutoML is used for certain tasks and estimators." " To tune hyperparameters for each estimator," " please provide budget either via time_budget or max_iter." 
) @@ -3004,7 +2916,7 @@ def is_to_reverse_metric(metric, task): else ( "bs" if n_concurrent_trials > 1 - or (self._use_ray is not False or self._use_spark) + or self._use_ray is not False and len(estimator_list) > 1 else "cfo" ) @@ -3101,24 +3013,20 @@ def visualize( plt.savefig("{}".format(plot_filename)) def _search_parallel(self): - if self._use_ray is not False: - try: - from ray import __version__ as ray_version - - assert ray_version >= "1.10.0" - if ray_version.startswith("1."): - from ray.tune.suggest import ConcurrencyLimiter - else: - from ray.tune.search import ConcurrencyLimiter - import ray - except (ImportError, AssertionError): - raise ImportError( - "use_ray=True requires installation of ray. " - "Please run pip install flaml[ray]" - ) - else: - from flaml.tune.searcher.suggestion import ConcurrencyLimiter + try: + from ray import __version__ as ray_version + assert ray_version >= "1.10.0" + if ray_version.startswith("1."): + from ray.tune.suggest import ConcurrencyLimiter + else: + from ray.tune.search import ConcurrencyLimiter + import ray + except (ImportError, AssertionError): + raise ImportError( + "n_concurrent_trial>1 or use_ray=True requires installation of ray. 
" + "Please run pip install flaml[ray]" + ) if self._hpo_method in ("cfo", "grid"): from flaml import CFO as SearchAlgo elif "bs" == self._hpo_method: @@ -3126,20 +3034,15 @@ def _search_parallel(self): elif "random" == self._hpo_method: from flaml import RandomSearch as SearchAlgo elif "optuna" == self._hpo_method: - if self._use_ray is not False: - try: - from ray import __version__ as ray_version + try: + from ray import __version__ as ray_version - assert ray_version >= "1.10.0" - if ray_version.startswith("1."): - from ray.tune.suggest.optuna import OptunaSearch as SearchAlgo - else: - from ray.tune.search.optuna import OptunaSearch as SearchAlgo - except (ImportError, AssertionError): - from flaml.tune.searcher.suggestion import ( - OptunaSearch as SearchAlgo, - ) - else: + assert ray_version >= "1.10.0" + if ray_version.startswith("1."): + from ray.tune.suggest.optuna import OptunaSearch as SearchAlgo + else: + from ray.tune.search.optuna import OptunaSearch as SearchAlgo + except (ImportError, AssertionError): from flaml.tune.searcher.suggestion import OptunaSearch as SearchAlgo else: raise NotImplementedError( @@ -3183,7 +3086,7 @@ def _search_parallel(self): allow_empty_config=True, ) else: - # if self._hpo_method is optuna, sometimes the search space and the initial config dimension do not match + # if self._hpo_method is bo, sometimes the search space and the initial config dimension do not match # need to remove the extra keys from the search space to be consistent with the initial config converted_space = SearchAlgo.convert_search_space(space) @@ -3205,40 +3108,21 @@ def _search_parallel(self): search_alg = ConcurrencyLimiter(search_alg, self._n_concurrent_trials) resources_per_trial = self._state.resources_per_trial - if self._use_spark: - # use spark as parallel backend - analysis = tune.run( - self.trainable, - search_alg=search_alg, - config=space, - metric="val_loss", - mode="min", - time_budget_s=time_budget_s, - num_samples=self._max_iter, - 
verbose=max(self.verbose - 2, 0), - use_ray=False, - use_spark=True, - # raise_on_failed_trial=False, - # keep_checkpoints_num=1, - # checkpoint_score_attr="min-val_loss", - ) - else: - # use ray as parallel backend - analysis = ray.tune.run( - self.trainable, - search_alg=search_alg, - config=space, - metric="val_loss", - mode="min", - resources_per_trial=resources_per_trial, - time_budget_s=time_budget_s, - num_samples=self._max_iter, - verbose=max(self.verbose - 2, 0), - raise_on_failed_trial=False, - keep_checkpoints_num=1, - checkpoint_score_attr="min-val_loss", - **self._use_ray if isinstance(self._use_ray, dict) else {}, - ) + analysis = ray.tune.run( + self.trainable, + search_alg=search_alg, + config=space, + metric="val_loss", + mode="min", + resources_per_trial=resources_per_trial, + time_budget_s=time_budget_s, + num_samples=self._max_iter, + verbose=max(self.verbose - 2, 0), + raise_on_failed_trial=False, + keep_checkpoints_num=1, + checkpoint_score_attr="min-val_loss", + **self._use_ray if isinstance(self._use_ray, dict) else {}, + ) # logger.info([trial.last_result for trial in analysis.trials]) trials = sorted( ( @@ -3442,7 +3326,7 @@ def _search_sequential(self): num_samples=self._max_iter, ) else: - # if self._hpo_method is optuna, sometimes the search space and the initial config dimension do not match + # if self._hpo_method is bo, sometimes the search space and the initial config dimension do not match # need to remove the extra keys from the search space to be consistent with the initial config converted_space = SearchAlgo.convert_search_space(search_space) removed_keys = set(search_space.keys()).difference( @@ -3481,7 +3365,6 @@ def _search_sequential(self): time_budget_s=time_budget_s, verbose=max(self.verbose - 3, 0), use_ray=False, - use_spark=False, ) time_used = time.time() - start_run_time better = False @@ -3652,7 +3535,7 @@ def _search(self): self._selected = state = self._search_states[estimator] state.best_config_sample_size = 
self._state.data_size[0] state.best_config = state.init_config[0] if state.init_config else {} - elif self._use_ray is False and self._use_spark is False: + elif self._use_ray is False: self._search_sequential() else: self._search_parallel() @@ -3683,7 +3566,7 @@ def _search(self): x[1].learner_class( task=self._state.task, n_jobs=self._state.n_jobs, - **AutoMLState.sanitize(x[1].best_config), + **self._state.sanitize(x[1].best_config), ), ) for x in search_states[:2] @@ -3694,7 +3577,7 @@ def _search(self): x[1].learner_class( task=self._state.task, n_jobs=self._state.n_jobs, - **AutoMLState.sanitize(x[1].best_config), + **self._state.sanitize(x[1].best_config), ), ) for x in search_states[2:] @@ -3716,10 +3599,6 @@ def _search(self): and ray.available_resources()["CPU"] or os.cpu_count() ) - elif self._use_spark: - from flaml.tune.spark.utils import get_n_cpus - - n_cpus = get_n_cpus() else: n_cpus = os.cpu_count() ensemble_n_jobs = ( diff --git a/flaml/tune/searcher/blendsearch.py b/flaml/tune/searcher/blendsearch.py index f23d35c226..e1227aa772 100644 --- a/flaml/tune/searcher/blendsearch.py +++ b/flaml/tune/searcher/blendsearch.py @@ -124,7 +124,7 @@ def __init__( objectives in the metric list. If not provided, we use "min" as the default mode for all the objectives. - "targets" (optional): a dictionary to specify the optimization targets on the objectives. The keys are the metric names (provided in "metric"), and the values are the numerical target values. - - "tolerances" (optional): a dictionary to specify the optimality tolerances on objectives. The keys are the + - "tolerances"(optional): a dictionary to specify the optimality tolerances on objectives. The keys are the metric names (provided in "metrics"), and the values are the numerical tolerances values. 
E.g., ```python @@ -653,11 +653,10 @@ def _expand_admissible_region(self, lower, upper, space): for key in upper: ub = upper[key] if isinstance(ub, list): - choice = space[key].get("_choice_") - if choice: - self._expand_admissible_region( - lower[key][choice], upper[key][choice], space[key] - ) + choice = space[key]["_choice_"] + self._expand_admissible_region( + lower[key][choice], upper[key][choice], space[key] + ) elif isinstance(ub, dict): self._expand_admissible_region(lower[key], ub, space[key]) else: diff --git a/flaml/tune/searcher/flow2.py b/flaml/tune/searcher/flow2.py index ce097ba0b2..799660e63f 100644 --- a/flaml/tune/searcher/flow2.py +++ b/flaml/tune/searcher/flow2.py @@ -80,7 +80,7 @@ def __init__( objectives in the metric list. If not provided, we use "min" as the default mode for all the objectives - "targets" (optional): a dictionary to specify the optimization targets on the objectives. The keys are the metric names (provided in "metric"), and the values are the numerical target values. - - "tolerances" (optional): a dictionary to specify the optimality tolerances on objectives. The keys are the + - "tolerances"(optional): a dictionary to specify the optimality tolerances on objectives. The keys are the metric names (provided in "metrics"), and the values are the numerical tolerances values. 
E.g., ```python diff --git a/flaml/tune/spark/__init__.py b/flaml/tune/spark/__init__.py deleted file mode 100644 index 873af1534c..0000000000 --- a/flaml/tune/spark/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from flaml.tune.spark.utils import ( - check_spark, - get_n_cpus, - with_parameters, - broadcast_code, -) - -__all__ = ["check_spark", "get_n_cpus", "with_parameters", "broadcast_code"] diff --git a/flaml/tune/spark/utils.py b/flaml/tune/spark/utils.py deleted file mode 100644 index 03337c59b6..0000000000 --- a/flaml/tune/spark/utils.py +++ /dev/null @@ -1,191 +0,0 @@ -import os -import logging -from functools import partial, lru_cache -import textwrap - -logger = logging.getLogger(__name__) -logger_formatter = logging.Formatter( - "[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s", "%m-%d %H:%M:%S" -) - -try: - from pyspark.sql import SparkSession - from pyspark.util import VersionUtils - import pyspark - - _have_spark = True - _spark_major_minor_version = VersionUtils.majorMinorVersion(pyspark.__version__) -except ImportError as e: - logger.debug("Could not import pyspark: %s", e) - _have_spark = False - _spark_major_minor_version = (0, 0) - - -@lru_cache(maxsize=2) -def check_spark(): - """Check if Spark is installed and running. - Result of the function will be cached since test once is enough. As lru_cache will not - cache exceptions, we don't raise exceptions here but only log a warning message. - - Returns: - Return (True, None) if the check passes, otherwise log the exception message and - return (False, Exception(msg)). The exception can be raised by the caller. - """ - logger.warning("\ncheck Spark installation...This line should appear only once.\n") - if not _have_spark: - msg = """use_spark=True requires installation of PySpark. 
Please run pip install flaml[spark] - and check [here](https://spark.apache.org/docs/latest/api/python/getting_started/install.html) - for more details about installing Spark.""" - logger.warning(msg) - return False, ImportError(msg) - - if _spark_major_minor_version[0] < 3: - msg = "Spark version must be >= 3.0 to use flaml[spark]" - logger.warning(msg) - return False, ImportError(msg) - - try: - SparkSession.builder.getOrCreate() - except RuntimeError as e: - logger.warning(f"\nSparkSession is not available: {e}\n") - return False, RuntimeError(e) - - return True, None - - -def get_n_cpus(node="driver"): - """Get the number of CPU cores of the given type of node. - - Args: - node: string | The type of node to get the number of cores. Can be 'driver' or 'executor'. - Default is 'driver'. - - Returns: - An int of the number of CPU cores. - """ - assert node in ["driver", "executor"] - try: - n_cpus = int( - SparkSession.builder.getOrCreate() - .sparkContext.getConf() - .get(f"spark.{node}.cores") - ) - except (TypeError, RuntimeError): - n_cpus = os.cpu_count() - return n_cpus - - -def with_parameters(trainable, **kwargs): - """Wrapper for trainables to pass arbitrary large data objects. - - This wrapper function will store all passed parameters in the Spark - Broadcast variable. - - Args: - trainable: Trainable to wrap. - **kwargs: parameters to store in object store. - - Returns: - A new function with partial application of the given arguments - and keywords. The given arguments and keywords will be broadcasted - to all the executors. 
- - - ```python - import pyspark - import flaml - from sklearn.datasets import load_iris - def train(config, data=None): - if isinstance(data, pyspark.broadcast.Broadcast): - data = data.value - print(config, data) - - data = load_iris() - with_parameters_train = flaml.tune.spark.utils.with_parameters(train, data=data) - with_parameters_train(config=1) - train(config={"metric": "accuracy"}) - ``` - """ - - if not callable(trainable): - raise ValueError( - f"`with_parameters() only works with function trainables`. " - f"Got type: " - f"{type(trainable)}." - ) - - spark_available, spark_error_msg = check_spark() - if not spark_available: - raise spark_error_msg - spark = SparkSession.builder.getOrCreate() - - bc_kwargs = dict() - for k, v in kwargs.items(): - bc_kwargs[k] = spark.sparkContext.broadcast(v) - - return partial(trainable, **bc_kwargs) - - -def broadcast_code(custom_code="", file_name="mylearner"): - """Write customized learner/metric code contents to a file for importing. - It is necessary for using the customized learner/metric in spark backend. - The path of the learner/metric file will be returned. - - Args: - custom_code: str, default="" | code contents of the custom learner/metric. - file_name: str, default="mylearner" | file name of the custom learner/metric. - - Returns: - The path of the custom code file. 
- ```python - from flaml.tune.spark.utils import broadcast_code - from flaml.automl.model import LGBMEstimator - - custom_code = ''' - from flaml.automl.model import LGBMEstimator - from flaml import tune - - class MyLargeLGBM(LGBMEstimator): - @classmethod - def search_space(cls, **params): - return { - "n_estimators": { - "domain": tune.lograndint(lower=4, upper=32768), - "init_value": 32768, - "low_cost_init_value": 4, - }, - "num_leaves": { - "domain": tune.lograndint(lower=4, upper=32768), - "init_value": 32768, - "low_cost_init_value": 4, - }, - } - ''' - - broadcast_code(custom_code=custom_code) - from flaml.tune.spark.mylearner import MyLargeLGBM - assert isinstance(MyLargeLGBM(), LGBMEstimator) - ``` - """ - flaml_path = os.path.dirname(os.path.abspath(__file__)) - custom_code = textwrap.dedent(custom_code) - custom_path = os.path.join(flaml_path, file_name + ".py") - - with open(custom_path, "w") as f: - f.write(custom_code) - - return custom_path - - -def get_broadcast_data(broadcast_data): - """Get the broadcast data from the broadcast variable. - - Args: - broadcast_data: pyspark.broadcast.Broadcast | the broadcast variable. - - Returns: - The broadcast data. 
- """ - if _have_spark and isinstance(broadcast_data, pyspark.broadcast.Broadcast): - broadcast_data = broadcast_data.value - return broadcast_data diff --git a/flaml/tune/trial_runner.py b/flaml/tune/trial_runner.py index 8fe8185a73..6aa2bcd5bc 100644 --- a/flaml/tune/trial_runner.py +++ b/flaml/tune/trial_runner.py @@ -135,41 +135,3 @@ def step(self) -> Trial: def stop_trial(self, trial): super().stop_trial(trial) self.running_trial = None - - -class SparkTrialRunner(BaseTrialRunner): - """Implementation of the spark trial runner.""" - - def __init__( - self, - search_alg=None, - scheduler=None, - metric: Optional[str] = None, - mode: Optional[str] = "min", - ): - super().__init__(search_alg, scheduler, metric, mode) - self.running_trials = [] - - def step(self) -> Trial: - """Runs one step of the trial event loop. - - Callers should typically run this method repeatedly in a loop. They - may inspect or modify the runner's state in between calls to step(). - - Returns: - a trial to run. 
- """ - trial_id = Trial.generate_id() - config = self._search_alg.suggest(trial_id) - if config is not None: - trial = SimpleTrial(config, trial_id) - self.add_trial(trial) - trial.set_status(Trial.RUNNING) - self.running_trials.append(trial) - else: - trial = None - return trial - - def stop_trial(self, trial): - super().stop_trial(trial) - self.running_trials.remove(trial) diff --git a/flaml/tune/tune.py b/flaml/tune/tune.py index 7b5b2a62fe..bc2e11ada5 100644 --- a/flaml/tune/tune.py +++ b/flaml/tune/tune.py @@ -7,7 +7,6 @@ import datetime import time import os -import sys from collections import defaultdict try: @@ -16,9 +15,9 @@ assert ray_version >= "1.10.0" from ray.tune.analysis import ExperimentAnalysis as EA - ray_available = True + ray_import = True except (ImportError, AssertionError): - ray_available = False + ray_import = False from .analysis import ExperimentAnalysis as EA from .trial import Trial @@ -26,7 +25,6 @@ import logging logger = logging.getLogger(__name__) -logger.propagate = False _use_ray = True _runner = None _verbose = 0 @@ -228,7 +226,6 @@ def run( metric_constraints: Optional[List[Tuple[str, str, float]]] = None, max_failure: Optional[int] = 100, use_ray: Optional[bool] = False, - use_spark: Optional[bool] = False, use_incumbent_result_in_evaluation: Optional[bool] = None, log_file_name: Optional[str] = None, lexico_objectives: Optional[dict] = None, @@ -362,10 +359,9 @@ def easy_objective(config): print(analysis.trials[-1].last_result) ``` - verbose: 0, 1, 2, or 3. If ray or spark backend is used, their verbosity will be - affected by this argument. 0 = silent, 1 = only status updates, - 2 = status and brief trial results, 3 = status and detailed trial results. - Defaults to 2. + verbose: 0, 1, 2, or 3. Verbosity mode for ray if ray backend is used. + 0 = silent, 1 = only status updates, 2 = status and brief trial + results, 3 = status and detailed trial results. Defaults to 2. 
local_dir: A string of the local dir to save ray logs if ray backend is used; or a local dir to save the tuning log. num_samples: An integer of the number of configs to try. Defaults to 1. @@ -384,7 +380,6 @@ def easy_objective(config): max_failure: int | the maximal consecutive number of failures to sample a trial before the tuning is terminated. use_ray: A boolean of whether to use ray as the backend. - use_spark: A boolean of whether to use spark as the backend. log_file_name: A string of the log file name. Default to None. When set to None: if local_dir is not given, no log file is created; @@ -401,17 +396,17 @@ def easy_objective(config): objectives in the metric list. If not provided, we use "min" as the default mode for all the objectives. - "targets" (optional): a dictionary to specify the optimization targets on the objectives. The keys are the metric names (provided in "metric"), and the values are the numerical target values. - - "tolerances" (optional): a dictionary to specify the optimality tolerances on objectives. The keys are the + - "tolerances"(optional): a dictionary to specify the optimality tolerances on objectives. The keys are the metric names (provided in "metrics"), and the values are the numerical tolerances values. E.g., - ```python - lexico_objectives = { - "metrics": ["error_rate", "pred_time"], - "modes": ["min", "min"], - "tolerances": {"error_rate": 0.01, "pred_time": 0.0}, - "targets": {"error_rate": 0.0}, - } - ``` + ```python + lexico_objectives = { + "metrics": ["error_rate", "pred_time"], + "modes": ["min", "min"], + "tolerances": {"error_rate": 0.01, "pred_time": 0.0}, + "targets": {"error_rate": 0.0}, + } + ``` **ray_args: keyword arguments to pass to ray.tune.run(). Only valid when use_ray=True. 
""" @@ -428,10 +423,7 @@ def easy_objective(config): log_file_name = os.path.join( local_dir, "tune_" + str(datetime.datetime.now()).replace(":", "-") + ".log" ) - if use_ray and use_spark: - raise ValueError("use_ray and use_spark cannot be both True.") if not use_ray: - _use_ray = False _verbose = verbose old_handlers = logger.handlers old_level = logger.getEffectiveLevel() @@ -451,7 +443,7 @@ def easy_objective(config): logger.addHandler(logging.FileHandler(log_file_name)) elif not logger.hasHandlers(): # Add the console handler. - _ch = logging.StreamHandler(stream=sys.stdout) + _ch = logging.StreamHandler() logger_formatter = logging.Formatter( "[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s", "%m-%d %H:%M:%S", @@ -531,7 +523,7 @@ def easy_objective(config): if metric is None or mode is None: metric = metric or search_alg.metric or DEFAULT_METRIC mode = mode or search_alg.mode - if ray_available and use_ray: + if ray_import: if ray_version.startswith("1."): from ray.tune.suggest import ConcurrencyLimiter else: @@ -575,7 +567,7 @@ def easy_objective(config): params["grace_period"] = min_resource if reduction_factor: params["reduction_factor"] = reduction_factor - if ray_available: + if ray_import: from ray.tune.schedulers import ASHAScheduler scheduler = ASHAScheduler(**params) @@ -613,142 +605,6 @@ def easy_objective(config): _running_trial = old_running_trial _training_iteration = old_training_iteration - if use_spark: - # parallel run with spark - from flaml.tune.spark.utils import check_spark - - spark_available, spark_error_msg = check_spark() - if not spark_available: - raise spark_error_msg - try: - from pyspark.sql import SparkSession - from joblib import Parallel, delayed, parallel_backend - from joblibspark import register_spark - except ImportError as e: - raise ImportError( - f"{e}. Try pip install flaml[spark] or set use_spark=False." 
- ) - from flaml.tune.searcher.suggestion import ConcurrencyLimiter - from .trial_runner import SparkTrialRunner - - register_spark() - spark = SparkSession.builder.getOrCreate() - sc = spark._jsc.sc() - num_executors = ( - len([executor.host() for executor in sc.statusTracker().getExecutorInfos()]) - - 1 - ) - """ - By default, the number of executors is the number of VMs in the cluster. And we can - launch one trial per executor. However, sometimes we can launch more trials than - the number of executors (e.g., local mode). In this case, we can set the environment - variable `FLAML_MAX_CONCURRENT` to override the detected `num_executors`. - - `max_concurrent` is the maximum number of concurrent trials defined by `search_alg`, - `FLAML_MAX_CONCURRENT` will also be used to override `max_concurrent` if `search_alg` - is not an instance of `ConcurrencyLimiter`. - - The final number of concurrent trials is the minimum of `max_concurrent` and - `num_executors`. - """ - num_executors = max(num_executors, int(os.getenv("FLAML_MAX_CONCURRENT", 1)), 1) - time_start = time.time() - if scheduler: - scheduler.set_search_properties(metric=metric, mode=mode) - if isinstance(search_alg, ConcurrencyLimiter): - max_concurrent = max(1, search_alg.max_concurrent) - else: - max_concurrent = max(1, int(os.getenv("FLAML_MAX_CONCURRENT", 1))) - - n_concurrent_trials = min(num_executors, max_concurrent) - with parallel_backend("spark"): - with Parallel( - n_jobs=n_concurrent_trials, verbose=max(0, (verbose - 1) * 50) - ) as parallel: - try: - _runner = SparkTrialRunner( - search_alg=search_alg, - scheduler=scheduler, - metric=metric, - mode=mode, - ) - num_trials = 0 - if time_budget_s is None: - time_budget_s = np.inf - fail = 0 - ub = ( - len(evaluated_rewards) if evaluated_rewards else 0 - ) + max_failure - while ( - time.time() - time_start < time_budget_s - and (num_samples < 0 or num_trials < num_samples) - and fail < ub - ): - while len(_runner.running_trials) < 
n_concurrent_trials: - # suggest trials for spark - trial_next = _runner.step() - if trial_next: - num_trials += 1 - else: - fail += 1 # break with ub consecutive failures - logger.debug(f"consecutive failures is {fail}") - if fail >= ub: - break - trials_to_run = _runner.running_trials - if not trials_to_run: - logger.warning( - f"fail to sample a trial for {max_failure} times in a row, stopping." - ) - break - logger.info( - f"Number of trials: {num_trials}/{num_samples}, {len(_runner.running_trials)} RUNNING," - f" {len(_runner._trials) - len(_runner.running_trials)} TERMINATED" - ) - logger.debug( - f"Configs of Trials to run: {[trial_to_run.config for trial_to_run in trials_to_run]}" - ) - results = parallel( - delayed(evaluation_function)(trial_to_run.config) - for trial_to_run in trials_to_run - ) - # results = [evaluation_function(trial_to_run.config) for trial_to_run in trials_to_run] - while results: - result = results.pop(0) - trial_to_run = trials_to_run[0] - _runner.running_trial = trial_to_run - if result is not None: - if isinstance(result, dict): - if result: - logger.info(f"Brief result: {result}") - report(**result) - else: - # When the result returned is an empty dict, set the trial status to error - trial_to_run.set_status(Trial.ERROR) - else: - logger.info( - "Brief result: {}".format({metric: result}) - ) - report(_metric=result) - _runner.stop_trial(trial_to_run) - fail = 0 - analysis = ExperimentAnalysis( - _runner.get_trials(), - metric=metric, - mode=mode, - lexico_objectives=lexico_objectives, - ) - return analysis - finally: - # recover the global variables in case of nested run - _use_ray = old_use_ray - _verbose = old_verbose - _running_trial = old_running_trial - _training_iteration = old_training_iteration - if not use_ray: - _runner = old_runner - logger.handlers = old_handlers - logger.setLevel(old_level) - # simple sequential run without using tune.run() from ray time_start = time.time() _use_ray = False diff --git 
a/notebook/automl_lightgbm.ipynb b/notebook/automl_lightgbm.ipynb index 410912cd57..3b76e39c09 100644 --- a/notebook/automl_lightgbm.ipynb +++ b/notebook/automl_lightgbm.ipynb @@ -1041,7 +1041,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.8.13 ('syml-py38')", + "display_name": "Python 3.9.12 64-bit", "language": "python", "name": "python3" }, @@ -1055,11 +1055,11 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.9.12" }, "vscode": { "interpreter": { - "hash": "e3d9487e2ef008ade0db1bc293d3206d35cb2b6081faff9f66b40b257b7398f7" + "hash": "949777d72b0d2535278d3dc13498b2535136f6dfe0678499012e853ee9abcab1" } } }, diff --git a/notebook/integrate_azureml.ipynb b/notebook/integrate_azureml.ipynb index b7f0694f7f..b34f724fda 100644 --- a/notebook/integrate_azureml.ipynb +++ b/notebook/integrate_azureml.ipynb @@ -203,7 +203,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.8.13 ('syml-py38')", + "display_name": "Python 3.9.7 ('base')", "language": "python", "name": "python3" }, @@ -217,11 +217,11 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.9.7" }, "vscode": { "interpreter": { - "hash": "e3d9487e2ef008ade0db1bc293d3206d35cb2b6081faff9f66b40b257b7398f7" + "hash": "e811209110f5aa4d8c2189eeb3ff7b9b4d146931cb9189ef6041ff71605c541d" } } }, diff --git a/notebook/integrate_spark.ipynb b/notebook/integrate_spark.ipynb deleted file mode 100644 index e440787b0a..0000000000 --- a/notebook/integrate_spark.ipynb +++ /dev/null @@ -1 +0,0 @@ -{"cells":[{"attachments":{},"cell_type":"markdown","metadata":{"slideshow":{"slide_type":"slide"}},"source":["Copyright (c) Microsoft Corporation. All rights reserved. \n","\n","Licensed under the MIT License.\n","\n","# Run FLAML Parallel tuning with Spark\n","\n","\n","## 1. 
Introduction\n","\n","FLAML is a Python library (https://github.com/microsoft/FLAML) designed to automatically produce accurate machine learning models \n","with low computational cost. It is fast and economical. The simple and lightweight design makes it easy \n","to use and extend, such as adding new learners. FLAML can \n","- serve as an economical AutoML engine,\n","- be used as a fast hyperparameter tuning tool, or \n","- be embedded in self-tuning software that requires low latency & resource in repetitive\n"," tuning tasks.\n","\n","In this notebook, we demonstrate how to run FLAML parallel tuning using Spark as the backend.\n","\n","FLAML requires `Python>=3.7`. To run this notebook example, please install flaml with the following options:\n","```bash\n","pip install flaml[spark,notebook,blendsearch]>=1.1.0\n","```\n","*Spark support is added in v1.1.0*"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:16:51.6335768Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:17:21.9028602Z\",\"execution_finish_time\":\"2022-12-07T08:18:52.3646576Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}"},"outputs":[],"source":["# %pip install flaml[spark,notebook,blendsearch]>=1.1.0"]},{"cell_type":"markdown","metadata":{"slideshow":{"slide_type":"slide"}},"source":["## 2. Regression Example\n","### Load data and preprocess\n","\n","Download [houses dataset](https://www.openml.org/d/537) from OpenML. 
The task is to predict median price of the house in the region based on demographic composition and a state of housing market in the region."]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:53.4783943Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:20:55.7666047Z\",\"execution_finish_time\":\"2022-12-07T08:21:10.9050139Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","slideshow":{"slide_type":"subslide"},"tags":[]},"outputs":[],"source":["from flaml.data import load_openml_dataset\n","X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=537, data_dir='./')"]},{"cell_type":"markdown","metadata":{"slideshow":{"slide_type":"slide"}},"source":["### Run FLAML\n","In the FLAML automl run configuration, users can specify the task type, time budget, error metric, learner list, whether to subsample, resampling strategy type, and so on. All these arguments have default values which will be used if users do not provide them. 
\n","\n","Notice that here `use_spark` is set to `True` in order to use Spark as the parallel training backend."]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:53.7001471Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:21:10.9846131Z\",\"execution_finish_time\":\"2022-12-07T08:21:11.3604062Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","slideshow":{"slide_type":"slide"},"tags":[]},"outputs":[],"source":["''' import AutoML class from flaml package '''\n","from flaml import AutoML\n","automl = AutoML()"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:53.8983341Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:21:11.4417491Z\",\"execution_finish_time\":\"2022-12-07T08:21:11.8242955Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","slideshow":{"slide_type":"slide"}},"outputs":[],"source":["settings = {\n"," \"time_budget\": 30, # total running time in seconds\n"," \"metric\": 'r2', # primary metrics for regression can be chosen from: ['mae','mse','r2','rmse','mape']\n"," \"estimator_list\": ['lgbm'], # list of ML learners; we tune lightgbm in this example\n"," \"task\": 'regression', # task type \n"," \"log_file_name\": 'houses_experiment.log', # flaml log file\n"," \"seed\": 7654321, # random seed\n"," \"use_spark\": True, # whether to use Spark for distributed training\n"," \"n_concurrent_trials\": 2, # the maximum number of concurrent trials\n","}"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li 
Jiang\":{\"queued_time\":\"2022-12-07T08:20:54.3953298Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:21:11.9003975Z\",\"execution_finish_time\":\"2022-12-07T08:27:58.525709Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","slideshow":{"slide_type":"slide"},"tags":[]},"outputs":[],"source":["'''The main flaml automl API'''\n","automl.fit(X_train=X_train, y_train=y_train, **settings)"]},{"cell_type":"markdown","metadata":{"slideshow":{"slide_type":"slide"}},"source":["### Best model and metric"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:54.789647Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:27:58.6014435Z\",\"execution_finish_time\":\"2022-12-07T08:27:58.9745212Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","slideshow":{"slide_type":"slide"},"tags":[]},"outputs":[],"source":["''' retrieve best config'''\n","print('Best hyperparmeter config:', automl.best_config)\n","print('Best r2 on validation data: {0:.4g}'.format(1-automl.best_loss))\n","print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:54.9962623Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:27:59.0491242Z\",\"execution_finish_time\":\"2022-12-07T08:27:59.4076477Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","slideshow":{"slide_type":"slide"}},"outputs":[],"source":["automl.model.estimator"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li 
Jiang\":{\"queued_time\":\"2022-12-07T08:20:55.2539877Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:27:59.5247209Z\",\"execution_finish_time\":\"2022-12-07T08:28:00.4849272Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}"},"outputs":[],"source":["import matplotlib.pyplot as plt\n","plt.barh(automl.feature_names_in_, automl.feature_importances_)"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:55.5182783Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:28:00.5644015Z\",\"execution_finish_time\":\"2022-12-07T08:28:01.5531147Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","slideshow":{"slide_type":"slide"}},"outputs":[],"source":["''' pickle and save the automl object '''\n","import pickle\n","with open('automl.pkl', 'wb') as f:\n"," pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:55.803107Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:28:01.6350567Z\",\"execution_finish_time\":\"2022-12-07T08:28:02.5774117Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","slideshow":{"slide_type":"slide"},"tags":[]},"outputs":[],"source":["''' compute predictions of testing dataset ''' \n","y_pred = automl.predict(X_test)\n","print('Predicted labels', y_pred)\n","print('True labels', y_test)"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:56.0585537Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:28:02.6537337Z\",\"execution_finish_time\":\"2022-12-07T08:28:03.0177805Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","slideshow":{"slide_type":"slide"},"tags":[]},"outputs":[],"source":["''' compute different metric values on testing 
dataset'''\n","from flaml.ml import sklearn_metric_loss_score\n","print('r2', '=', 1 - sklearn_metric_loss_score('r2', y_pred, y_test))\n","print('mse', '=', sklearn_metric_loss_score('mse', y_pred, y_test))\n","print('mae', '=', sklearn_metric_loss_score('mae', y_pred, y_test))"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:56.2226463Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:28:03.1150781Z\",\"execution_finish_time\":\"2022-12-07T08:28:03.4858362Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","slideshow":{"slide_type":"subslide"},"tags":[]},"outputs":[],"source":["from flaml.data import get_output_from_log\n","time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \\\n"," get_output_from_log(filename=settings['log_file_name'], time_budget=60)\n","\n","for config in config_history:\n"," print(config)"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T08:20:56.4020235Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T08:28:03.5811012Z\",\"execution_finish_time\":\"2022-12-07T08:28:04.5493292Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","slideshow":{"slide_type":"slide"}},"outputs":[],"source":["import numpy as np\n","\n","plt.title('Learning Curve')\n","plt.xlabel('Wall Clock Time (s)')\n","plt.ylabel('Validation r2')\n","plt.scatter(time_history, 1 - np.array(valid_loss_history))\n","plt.step(time_history, 1 - np.array(best_valid_loss_history), where='post')\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":["## 3. Add a customized LightGBM learner in FLAML\n","The native API of LightGBM allows one to specify a custom objective function in the model constructor. You can easily enable it by adding a customized LightGBM learner in FLAML. 
In the following example, we show how to add such a customized LightGBM learner with a custom objective function for parallel tuning with Spark.\n","\n","It's a little bit different from adding customized learners for sequential training. In sequential training, we can define the customized learner in a notebook cell. However, in spark training, we have to import it from a file so that Spark can use it in executors. We can easily do it by leveraging `broadcast_code` function in `flaml.tune.spark.utils`."]},{"cell_type":"markdown","metadata":{},"source":["### Create a customized LightGBM learner with a custom objective function"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T09:09:49.540914Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T09:09:49.6259637Z\",\"execution_finish_time\":\"2022-12-07T09:09:50.5841239Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}"},"outputs":[],"source":["custom_code = \"\"\"\n","import numpy as np \n","from flaml.model import LGBMEstimator\n","from flaml import tune\n","\n","\n","''' define your customized objective function '''\n","def my_loss_obj(y_true, y_pred):\n"," c = 0.5\n"," residual = y_pred - y_true\n"," grad = c * residual /(np.abs(residual) + c)\n"," hess = c ** 2 / (np.abs(residual) + c) ** 2\n"," # rmse grad and hess\n"," grad_rmse = residual\n"," hess_rmse = 1.0\n"," \n"," # mae grad and hess\n"," grad_mae = np.array(residual)\n"," grad_mae[grad_mae > 0] = 1.\n"," grad_mae[grad_mae <= 0] = -1.\n"," hess_mae = 1.0\n","\n"," coef = [0.4, 0.3, 0.3]\n"," return coef[0] * grad + coef[1] * grad_rmse + coef[2] * grad_mae, \\\n"," coef[0] * hess + coef[1] * hess_rmse + coef[2] * hess_mae\n","\n","\n","''' create a customized LightGBM learner class with your objective function '''\n","class MyLGBM(LGBMEstimator):\n"," '''LGBMEstimator with my_loss_obj as the objective function\n"," '''\n","\n"," def __init__(self, 
**config):\n"," super().__init__(objective=my_loss_obj, **config)\n","\"\"\"\n","\n","from flaml.tune.spark.utils import broadcast_code\n","custom_learner_path = broadcast_code(custom_code=custom_code)\n","print(custom_learner_path)\n","from flaml.tune.spark.mylearner import MyLGBM"]},{"cell_type":"markdown","metadata":{},"source":["### Add the customized learner in FLAML"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T09:14:16.2449566Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T09:14:16.3227204Z\",\"execution_finish_time\":\"2022-12-07T09:16:49.7573919Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","tags":[]},"outputs":[],"source":["automl = AutoML()\n","automl.add_learner(learner_name='my_lgbm', learner_class=MyLGBM)\n","settings = {\n"," \"time_budget\": 30, # total running time in seconds\n"," \"metric\": 'r2', # primary metrics for regression can be chosen from: ['mae','mse','r2']\n"," \"estimator_list\": ['my_lgbm',], # list of ML learners; we tune lightgbm in this example\n"," \"task\": 'regression', # task type \n"," \"log_file_name\": 'houses_experiment_my_lgbm.log', # flaml log file\n"," \"n_concurrent_trials\": 2,\n"," \"use_spark\": True,\n","}\n","automl.fit(X_train=X_train, y_train=y_train, **settings)"]},{"cell_type":"code","execution_count":null,"metadata":{"cellStatus":"{\"Li Jiang\":{\"queued_time\":\"2022-12-07T09:17:06.0159529Z\",\"session_start_time\":null,\"execution_start_time\":\"2022-12-07T09:17:06.1042554Z\",\"execution_finish_time\":\"2022-12-07T09:17:06.467989Z\",\"state\":\"finished\",\"livy_statement_state\":\"available\"}}","tags":[]},"outputs":[],"source":["print('Best hyperparmeter config:', automl.best_config)\n","print('Best r2 on validation data: {0:.4g}'.format(1-automl.best_loss))\n","print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))\n","\n","y_pred = 
automl.predict(X_test)\n","print('Predicted labels', y_pred)\n","print('True labels', y_test)\n","\n","from flaml.ml import sklearn_metric_loss_score\n","print('r2', '=', 1 - sklearn_metric_loss_score('r2', y_pred, y_test))\n","print('mse', '=', sklearn_metric_loss_score('mse', y_pred, y_test))\n","print('mae', '=', sklearn_metric_loss_score('mae', y_pred, y_test))"]},{"cell_type":"code","execution_count":null,"metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"nteract":{"transient":{"deleting":false}}},"outputs":[],"source":[]}],"metadata":{"kernel_info":{"name":"synapse_pyspark"},"kernelspec":{"display_name":"Python 3.8.13 ('syml-py38')","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.13 (default, Oct 21 2022, 23:50:54) \n[GCC 11.2.0]"},"notebook_environment":{},"save_output":true,"spark_compute":{"compute_id":"/trident/default","session_options":{"conf":{"spark.livy.synapse.ipythonInterpreter.enabled":"true"},"enableDebugMode":false,"keepAliveTimeout":30}},"synapse_widget":{"state":{},"version":"0.1"},"trident":{"lakehouse":{}},"vscode":{"interpreter":{"hash":"e3d9487e2ef008ade0db1bc293d3206d35cb2b6081faff9f66b40b257b7398f7"}}},"nbformat":4,"nbformat_minor":0} diff --git a/notebook/tune_pytorch.ipynb b/notebook/tune_pytorch.ipynb index 93153ac50c..d90f4fb9c1 100644 --- a/notebook/tune_pytorch.ipynb +++ b/notebook/tune_pytorch.ipynb @@ -347,11 +347,7 @@ " best_trained_model = nn.DataParallel(best_trained_model)\n", "best_trained_model.to(device)\n", "\n", - "checkpoint_value = (\n", - " getattr(best_trial.checkpoint, \"dir_or_data\", None)\n", - " or best_trial.checkpoint.value\n", - ")\n", - "checkpoint_path = os.path.join(checkpoint_value, \"checkpoint\")\n", + "checkpoint_path = os.path.join(best_trial.checkpoint.value, \"checkpoint\")\n", "\n", 
"model_state, optimizer_state = torch.load(checkpoint_path)\n", "best_trained_model.load_state_dict(model_state)\n", @@ -362,9 +358,11 @@ } ], "metadata": { + "interpreter": { + "hash": "f7771e6a3915580179405189f5aa4eb9047494cbe4e005b29b851351b54902f6" + }, "kernelspec": { - "display_name": "Python 3.11.0 64-bit", - "language": "python", + "display_name": "Python 3.8.10 64-bit ('venv': venv)", "name": "python3" }, "language_info": { @@ -377,17 +375,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.0" + "version": "3.8.12" }, "metadata": { "interpreter": { "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" } - }, - "vscode": { - "interpreter": { - "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" - } } }, "nbformat": 4, diff --git a/setup.py b/setup.py index b798035c29..3e4dadd62b 100644 --- a/setup.py +++ b/setup.py @@ -44,10 +44,6 @@ "matplotlib", "openml==0.10.2", ], - "spark": [ - "pyspark>=3.0.0", - "joblibspark>=0.5.0", - ], "test": [ "flake8>=3.8.4", "thop", @@ -72,10 +68,6 @@ "matplotlib", "pytorch-forecasting>=0.9.0,<=0.10.1", "mlflow", - "pyspark>=3.0.0", - "joblibspark>=0.5.0", - "nbconvert", - "nbformat", ], "catboost": ["catboost>=0.26"], "blendsearch": ["optuna==2.8.0"], diff --git a/test/automl/__init__.py b/test/automl/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/automl/test_multiclass.py b/test/automl/test_multiclass.py index b9f7b177a5..f6bb383964 100644 --- a/test/automl/test_multiclass.py +++ b/test/automl/test_multiclass.py @@ -146,18 +146,6 @@ def test_custom_learner(self): MyRegularizedGreedyForest.search_space = lambda data_size, task: {} automl.fit(X_train=X_train, y_train=y_train, **settings) - try: - import ray - - del settings["time_budget"] - settings["max_iter"] = 5 - # test the "_choice_" issue when using ray - automl.fit( - X_train=X_train, y_train=y_train, n_concurrent_trials=2, **settings - ) 
- except ImportError: - return - def test_ensemble(self): automl = AutoML() automl.add_learner(learner_name="RGF", learner_class=MyRegularizedGreedyForest) @@ -183,8 +171,8 @@ def test_dataframe(self): def test_custom_metric(self): df, y = load_iris(return_X_y=True, as_frame=True) df["label"] = y - automl = AutoML() - settings = { + automl_experiment = AutoML() + automl_settings = { "dataframe": df, "label": "label", "time_budget": 5, @@ -200,16 +188,16 @@ def test_custom_metric(self): "pred_time_limit": 1e-5, "ensemble": True, } - automl.fit(**settings) - print(automl.classes_) - print(automl.model) - print(automl.config_history) - print(automl.best_model_for_estimator("rf")) - print(automl.best_iteration) - print(automl.best_estimator) - automl = AutoML() - estimator = automl.get_estimator_from_log( - settings["log_file_name"], record_id=0, task="multiclass" + automl_experiment.fit(**automl_settings) + print(automl_experiment.classes_) + print(automl_experiment.model) + print(automl_experiment.config_history) + print(automl_experiment.best_model_for_estimator("rf")) + print(automl_experiment.best_iteration) + print(automl_experiment.best_estimator) + automl_experiment = AutoML() + estimator = automl_experiment.get_estimator_from_log( + automl_settings["log_file_name"], record_id=0, task="multiclass" ) print(estimator) ( @@ -218,20 +206,17 @@ def test_custom_metric(self): valid_loss_history, config_history, metric_history, - ) = get_output_from_log(filename=settings["log_file_name"], time_budget=6) + ) = get_output_from_log( + filename=automl_settings["log_file_name"], time_budget=6 + ) print(metric_history) try: import ray df = ray.put(df) - settings["dataframe"] = df - settings["use_ray"] = True - del settings["time_budget"] - settings["max_iter"] = 2 - automl.fit(**settings) - estimator = automl.get_estimator_from_log( - settings["log_file_name"], record_id=1, task="multiclass" - ) + automl_settings["dataframe"] = df + automl_settings["use_ray"] = True + 
automl_experiment.fit(**automl_settings) except ImportError: pass @@ -334,8 +319,8 @@ def test_roc_auc_ovo(self): automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) def test_roc_auc_ovr_weighted(self): - automl = AutoML() - settings = { + automl_experiment = AutoML() + automl_settings = { "time_budget": 1, "metric": "roc_auc_ovr_weighted", "task": "classification", @@ -345,7 +330,7 @@ def test_roc_auc_ovr_weighted(self): "model_history": True, } X_train, y_train = load_iris(return_X_y=True) - automl.fit(X_train=X_train, y_train=y_train, **settings) + automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) def test_roc_auc_ovo_weighted(self): automl_experiment = AutoML() @@ -430,10 +415,10 @@ def test_time_limit(self): automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) print(automl_experiment.model) - def test_fit_w_starting_point(self, as_frame=True, n_concurrent_trials=1): - automl = AutoML() - settings = { - "max_iter": 3, + def test_fit_w_starting_point(self, as_frame=True): + automl_experiment = AutoML() + automl_settings = { + "time_budget": 3, "metric": "accuracy", "task": "classification", "log_file_name": "test/iris.log", @@ -446,26 +431,21 @@ def test_fit_w_starting_point(self, as_frame=True, n_concurrent_trials=1): # test drop column X_train.columns = range(X_train.shape[1]) X_train[X_train.shape[1]] = np.zeros(len(y_train)) - automl.fit( - X_train=X_train, - y_train=y_train, - n_concurrent_trials=n_concurrent_trials, - **settings - ) - automl_val_accuracy = 1.0 - automl.best_loss - print("Best ML leaner:", automl.best_estimator) - print("Best hyperparmeter config:", automl.best_config) + automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) + automl_val_accuracy = 1.0 - automl_experiment.best_loss + print("Best ML leaner:", automl_experiment.best_estimator) + print("Best hyperparmeter config:", automl_experiment.best_config) print("Best accuracy on validation data: 
{0:.4g}".format(automl_val_accuracy)) print( "Training duration of best run: {0:.4g} s".format( - automl.best_config_train_time + automl_experiment.best_config_train_time ) ) - starting_points = automl.best_config_per_estimator + starting_points = automl_experiment.best_config_per_estimator print("starting_points", starting_points) - print("loss of the starting_points", automl.best_loss_per_estimator) - settings_resume = { + print("loss of the starting_points", automl_experiment.best_loss_per_estimator) + automl_settings_resume = { "time_budget": 2, "metric": "accuracy", "task": "classification", @@ -476,34 +456,27 @@ def test_fit_w_starting_point(self, as_frame=True, n_concurrent_trials=1): "log_type": "all", "starting_points": starting_points, } - new_automl = AutoML() - new_automl.fit(X_train=X_train, y_train=y_train, **settings_resume) + new_automl_experiment = AutoML() + new_automl_experiment.fit( + X_train=X_train, y_train=y_train, **automl_settings_resume + ) - new_automl_val_accuracy = 1.0 - new_automl.best_loss - print("Best ML leaner:", new_automl.best_estimator) - print("Best hyperparmeter config:", new_automl.best_config) + new_automl_val_accuracy = 1.0 - new_automl_experiment.best_loss + print("Best ML leaner:", new_automl_experiment.best_estimator) + print("Best hyperparmeter config:", new_automl_experiment.best_config) print( "Best accuracy on validation data: {0:.4g}".format(new_automl_val_accuracy) ) print( "Training duration of best run: {0:.4g} s".format( - new_automl.best_config_train_time + new_automl_experiment.best_config_train_time ) ) - def test_fit_w_starting_point_2(self, as_frame=True): - try: - import ray - - self.test_fit_w_starting_points_list(as_frame, 2) - self.test_fit_w_starting_point(as_frame, 2) - except ImportError: - pass - - def test_fit_w_starting_points_list(self, as_frame=True, n_concurrent_trials=1): - automl = AutoML() - settings = { - "max_iter": 3, + def test_fit_w_starting_points_list(self, as_frame=True): + 
automl_experiment = AutoML() + automl_settings = { + "time_budget": 3, "metric": "accuracy", "task": "classification", "log_file_name": "test/iris.log", @@ -516,24 +489,19 @@ def test_fit_w_starting_points_list(self, as_frame=True, n_concurrent_trials=1): # test drop column X_train.columns = range(X_train.shape[1]) X_train[X_train.shape[1]] = np.zeros(len(y_train)) - automl.fit( - X_train=X_train, - y_train=y_train, - n_concurrent_trials=n_concurrent_trials, - **settings - ) - automl_val_accuracy = 1.0 - automl.best_loss - print("Best ML leaner:", automl.best_estimator) - print("Best hyperparmeter config:", automl.best_config) + automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) + automl_val_accuracy = 1.0 - automl_experiment.best_loss + print("Best ML leaner:", automl_experiment.best_estimator) + print("Best hyperparmeter config:", automl_experiment.best_config) print("Best accuracy on validation data: {0:.4g}".format(automl_val_accuracy)) print( "Training duration of best run: {0:.4g} s".format( - automl.best_config_train_time + automl_experiment.best_config_train_time ) ) starting_points = {} - log_file_name = settings["log_file_name"] + log_file_name = automl_settings["log_file_name"] with training_log_reader(log_file_name) as reader: sample_size = 1000 for record in reader.records(): @@ -545,7 +513,7 @@ def test_fit_w_starting_points_list(self, as_frame=True, n_concurrent_trials=1): starting_points[learner] = [] starting_points[learner].append(config) max_iter = sum([len(s) for k, s in starting_points.items()]) - settings_resume = { + automl_settings_resume = { "time_budget": 2, "metric": "accuracy", "task": "classification", @@ -558,12 +526,14 @@ def test_fit_w_starting_points_list(self, as_frame=True, n_concurrent_trials=1): "starting_points": starting_points, "append_log": True, } - new_automl = AutoML() - new_automl.fit(X_train=X_train, y_train=y_train, **settings_resume) + new_automl_experiment = AutoML() + 
new_automl_experiment.fit( + X_train=X_train, y_train=y_train, **automl_settings_resume + ) - new_automl_val_accuracy = 1.0 - new_automl.best_loss - # print('Best ML leaner:', new_automl.best_estimator) - # print('Best hyperparmeter config:', new_automl.best_config) + new_automl_val_accuracy = 1.0 - new_automl_experiment.best_loss + # print('Best ML leaner:', new_automl_experiment.best_estimator) + # print('Best hyperparmeter config:', new_automl_experiment.best_config) print( "Best accuracy on validation data: {0:.4g}".format(new_automl_val_accuracy) ) diff --git a/test/automl/test_python_log.py b/test/automl/test_python_log.py index 5a1e9ec153..3681fcd455 100644 --- a/test/automl/test_python_log.py +++ b/test/automl/test_python_log.py @@ -96,7 +96,7 @@ def test_logging_level(self): ) print(min(trial.last_result["val_loss"] for trial in analysis.trials)) config = analysis.trials[-1].last_result["config"]["ml"] - automl._state._train_with_config(config.pop("learner"), config) + automl._state._train_with_config(config["learner"], config) for _ in range(3): print( search_alg._ls.complete_config( diff --git a/test/automl/test_training_log.py b/test/automl/test_training_log.py index d8949e6d12..ff1b426735 100644 --- a/test/automl/test_training_log.py +++ b/test/automl/test_training_log.py @@ -40,9 +40,7 @@ def test_training_log( if automl.best_estimator: estimator, config = automl.best_estimator, automl.best_config model0 = automl.best_model_for_estimator(estimator) - print(model0.params) - if "n_estimators" in config: - assert model0.params["n_estimators"] == config["n_estimators"] + print(model0.params["n_estimators"], config) # train on full data with no time limit automl._state.time_budget = -1 diff --git a/test/spark/__init__.py b/test/spark/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/test/spark/custom_mylearner.py b/test/spark/custom_mylearner.py deleted file mode 100644 index 0ccc159ca9..0000000000 --- 
a/test/spark/custom_mylearner.py +++ /dev/null @@ -1,124 +0,0 @@ -from flaml.tune.spark.utils import broadcast_code - -custom_code = """ -from flaml import tune -from flaml.automl.model import LGBMEstimator, XGBoostSklearnEstimator, SKLearnEstimator -from flaml.automl.data import CLASSIFICATION, get_output_from_log - -class MyRegularizedGreedyForest(SKLearnEstimator): - def __init__(self, task="binary", **config): - - super().__init__(task, **config) - - if task in CLASSIFICATION: - from rgf.sklearn import RGFClassifier - - self.estimator_class = RGFClassifier - else: - from rgf.sklearn import RGFRegressor - - self.estimator_class = RGFRegressor - - @classmethod - def search_space(cls, data_size, task): - space = { - "max_leaf": { - "domain": tune.lograndint(lower=4, upper=data_size[0]), - "init_value": 4, - }, - "n_iter": { - "domain": tune.lograndint(lower=1, upper=data_size[0]), - "init_value": 1, - }, - "n_tree_search": { - "domain": tune.lograndint(lower=1, upper=32768), - "init_value": 1, - }, - "opt_interval": { - "domain": tune.lograndint(lower=1, upper=10000), - "init_value": 100, - }, - "learning_rate": {"domain": tune.loguniform(lower=0.01, upper=20.0)}, - "min_samples_leaf": { - "domain": tune.lograndint(lower=1, upper=20), - "init_value": 20, - }, - } - return space - - @classmethod - def size(cls, config): - max_leaves = int(round(config.get("max_leaf", 1))) - n_estimators = int(round(config.get("n_iter", 1))) - return (max_leaves * 3 + (max_leaves - 1) * 4 + 1.0) * n_estimators * 8 - - @classmethod - def cost_relative2lgbm(cls): - return 1.0 - - -class MyLargeXGB(XGBoostSklearnEstimator): - @classmethod - def search_space(cls, **params): - return { - "n_estimators": { - "domain": tune.lograndint(lower=4, upper=32768), - "init_value": 32768, - "low_cost_init_value": 4, - }, - "max_leaves": { - "domain": tune.lograndint(lower=4, upper=3276), - "init_value": 3276, - "low_cost_init_value": 4, - }, - } - - -class MyLargeLGBM(LGBMEstimator): - @classmethod 
- def search_space(cls, **params): - return { - "n_estimators": { - "domain": tune.lograndint(lower=4, upper=32768), - "init_value": 32768, - "low_cost_init_value": 4, - }, - "num_leaves": { - "domain": tune.lograndint(lower=4, upper=3276), - "init_value": 3276, - "low_cost_init_value": 4, - }, - } - - -def custom_metric( - X_val, - y_val, - estimator, - labels, - X_train, - y_train, - weight_val=None, - weight_train=None, - config=None, - groups_val=None, - groups_train=None, -): - from sklearn.metrics import log_loss - import time - - start = time.time() - y_pred = estimator.predict_proba(X_val) - pred_time = (time.time() - start) / len(X_val) - val_loss = log_loss(y_val, y_pred, labels=labels, sample_weight=weight_val) - y_pred = estimator.predict_proba(X_train) - train_loss = log_loss(y_train, y_pred, labels=labels, sample_weight=weight_train) - alpha = 0.5 - return val_loss * (1 + alpha) - alpha * train_loss, { - "val_loss": val_loss, - "train_loss": train_loss, - "pred_time": pred_time, - } -""" - -_ = broadcast_code(custom_code=custom_code) diff --git a/test/spark/mylearner.py b/test/spark/mylearner.py deleted file mode 100644 index 980e371eea..0000000000 --- a/test/spark/mylearner.py +++ /dev/null @@ -1,19 +0,0 @@ -from flaml.automl.model import LGBMEstimator -from flaml import tune - - -class MyLargeLGBM(LGBMEstimator): - @classmethod - def search_space(cls, **params): - return { - "n_estimators": { - "domain": tune.lograndint(lower=4, upper=32768), - "init_value": 32768, - "low_cost_init_value": 4, - }, - "num_leaves": { - "domain": tune.lograndint(lower=4, upper=32768), - "init_value": 32768, - "low_cost_init_value": 4, - }, - } diff --git a/test/spark/test_automl.py b/test/spark/test_automl.py deleted file mode 100644 index 79801c79e6..0000000000 --- a/test/spark/test_automl.py +++ /dev/null @@ -1,108 +0,0 @@ -import numpy as np -import scipy.sparse -from flaml import AutoML -from flaml.tune.spark.utils import check_spark -import os -import pytest - -# 
For spark, we need to put customized learner in a separate file -if os.path.exists(os.path.join(os.getcwd(), "test", "spark", "mylearner.py")): - try: - from test.spark.mylearner import MyLargeLGBM - - skip_my_learner = False - except ImportError: - skip_my_learner = True - MyLargeLGBM = None -else: - MyLargeLGBM = None - skip_my_learner = True - -os.environ["FLAML_MAX_CONCURRENT"] = "2" - -spark_available, _ = check_spark() -skip_spark = not spark_available - -pytestmark = pytest.mark.skipif( - skip_spark, reason="Spark is not installed. Skip all spark tests." -) - - -def test_parallel_xgboost(hpo_method=None, data_size=1000): - automl_experiment = AutoML() - automl_settings = { - "time_budget": 10, - "metric": "ap", - "task": "classification", - "log_file_name": "test/sparse_classification.log", - "estimator_list": ["xgboost"], - "log_type": "all", - "n_jobs": 1, - "n_concurrent_trials": 2, - "hpo_method": hpo_method, - "use_spark": True, - } - X_train = scipy.sparse.eye(data_size) - y_train = np.random.randint(2, size=data_size) - - automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) - print(automl_experiment.predict(X_train)) - print(automl_experiment.model) - print(automl_experiment.config_history) - print(automl_experiment.best_model_for_estimator("xgboost")) - print(automl_experiment.best_iteration) - print(automl_experiment.best_estimator) - - -def test_parallel_xgboost_others(): - # use random search as the hpo_method - test_parallel_xgboost(hpo_method="random") - - -@pytest.mark.skip( - reason="currently not supporting too large data, will support spark dataframe in the future" -) -def test_large_dataset(): - test_parallel_xgboost(data_size=90000000) - - -@pytest.mark.skipif( - skip_my_learner, - reason="please run pytest in the root directory of FLAML, i.e., the directory that contains the setup.py file", -) -def test_custom_learner(data_size=1000): - automl_experiment = AutoML() - 
automl_experiment.add_learner(learner_name="large_lgbm", learner_class=MyLargeLGBM) - automl_settings = { - "time_budget": 2, - "task": "classification", - "log_file_name": "test/sparse_classification_oom.log", - "estimator_list": ["large_lgbm"], - "log_type": "all", - "n_jobs": 1, - "hpo_method": "random", - "n_concurrent_trials": 2, - "use_spark": True, - } - X_train = scipy.sparse.eye(data_size) - y_train = np.random.randint(2, size=data_size) - - automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) - print(automl_experiment.predict(X_train)) - print(automl_experiment.model) - print(automl_experiment.config_history) - print(automl_experiment.best_model_for_estimator("large_lgbm")) - print(automl_experiment.best_iteration) - print(automl_experiment.best_estimator) - - -if __name__ == "__main__": - test_parallel_xgboost() - test_parallel_xgboost_others() - # test_large_dataset() - if skip_my_learner: - print( - "please run pytest in the root directory of FLAML, i.e., the directory that contains the setup.py file" - ) - else: - test_custom_learner() diff --git a/test/spark/test_ensemble.py b/test/spark/test_ensemble.py deleted file mode 100644 index 42199c267b..0000000000 --- a/test/spark/test_ensemble.py +++ /dev/null @@ -1,57 +0,0 @@ -import unittest -from sklearn.datasets import load_wine -from flaml import AutoML -from flaml.tune.spark.utils import check_spark -import os - -spark_available, _ = check_spark() -skip_spark = not spark_available - -os.environ["FLAML_MAX_CONCURRENT"] = "2" - -# To solve pylint issue, we put code for customizing mylearner in a separate file -if os.path.exists(os.path.join(os.getcwd(), "test", "spark", "custom_mylearner.py")): - try: - from test.spark.custom_mylearner import * - from flaml.tune.spark.mylearner import MyRegularizedGreedyForest - - skip_my_learner = False - except ImportError: - skip_my_learner = True -else: - skip_my_learner = True - - -class TestEnsemble(unittest.TestCase): - def setUp(self) -> 
None: - if skip_spark: - self.skipTest("Spark is not installed. Skip all spark tests.") - - @unittest.skipIf( - skip_my_learner, - "Please run pytest in the root directory of FLAML, i.e., the directory that contains the setup.py file", - ) - def test_ensemble(self): - automl = AutoML() - automl.add_learner(learner_name="RGF", learner_class=MyRegularizedGreedyForest) - X_train, y_train = load_wine(return_X_y=True) - settings = { - "time_budget": 5, # total running time in seconds - "estimator_list": ["rf", "xgboost", "catboost"], - "task": "classification", # task type - "sample": True, # whether to subsample training data - "log_file_name": "test/wine.log", - "log_training_metric": True, # whether to log training metric - "ensemble": { - "final_estimator": MyRegularizedGreedyForest(), - "passthrough": False, - }, - "n_jobs": 1, - "n_concurrent_trials": 2, - "use_spark": True, - } - automl.fit(X_train=X_train, y_train=y_train, **settings) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/spark/test_exceptions.py b/test/spark/test_exceptions.py deleted file mode 100644 index 0c31b8aded..0000000000 --- a/test/spark/test_exceptions.py +++ /dev/null @@ -1,76 +0,0 @@ -from flaml.automl.data import load_openml_dataset -from flaml import AutoML -from flaml.tune.spark.utils import check_spark -import os -import pytest - -spark_available, _ = check_spark() -skip_spark = not spark_available - -pytestmark = pytest.mark.skipif( - skip_spark, reason="Spark is not installed. Skip all spark tests." 
-) - -os.environ["FLAML_MAX_CONCURRENT"] = "2" - - -def base_automl(n_concurrent_trials=1, use_ray=False, use_spark=False, verbose=0): - X_train, X_test, y_train, y_test = load_openml_dataset( - dataset_id=537, data_dir="./" - ) - automl = AutoML() - settings = { - "time_budget": 3, # total running time in seconds - "metric": "r2", # primary metrics for regression can be chosen from: ['mae','mse','r2','rmse','mape'] - "estimator_list": ["lgbm", "rf", "xgboost"], # list of ML learners - "task": "regression", # task type - "log_file_name": "houses_experiment.log", # flaml log file - "seed": 7654321, # random seed - "n_concurrent_trials": n_concurrent_trials, # the maximum number of concurrent learners - "use_ray": use_ray, # whether to use Ray for distributed training - "use_spark": use_spark, # whether to use Spark for distributed training - "verbose": verbose, - } - - automl.fit(X_train=X_train, y_train=y_train, **settings) - - print("Best ML leaner:", automl.best_estimator) - print("Best hyperparmeter config:", automl.best_config) - print("Best accuracy on validation data: {0:.4g}".format(1 - automl.best_loss)) - print( - "Training duration of best run: {0:.4g} s".format(automl.best_config_train_time) - ) - - -def test_both_ray_spark(): - with pytest.raises(ValueError): - base_automl(n_concurrent_trials=2, use_ray=True, use_spark=True) - - -def test_verboses(): - for verbose in [1, 3, 5]: - base_automl(verbose=verbose) - - -def test_import_error(): - from importlib import reload - import flaml.tune.spark.utils as utils - - reload(utils) - utils._have_spark = False - spark_available, spark_error_msg = utils.check_spark() - assert not spark_available - assert isinstance(spark_error_msg, ImportError) - - reload(utils) - utils._spark_major_minor_version = (1, 1) - spark_available, spark_error_msg = utils.check_spark() - assert not spark_available - assert isinstance(spark_error_msg, ImportError) - - reload(utils) - - -if __name__ == "__main__": - base_automl() - 
test_import_error() diff --git a/test/spark/test_multiclass.py b/test/spark/test_multiclass.py deleted file mode 100644 index 9a2a3950a0..0000000000 --- a/test/spark/test_multiclass.py +++ /dev/null @@ -1,470 +0,0 @@ -import unittest -import numpy as np -import scipy.sparse -from sklearn.datasets import load_iris, load_wine -from flaml import AutoML -from flaml.automl.data import CLASSIFICATION, get_output_from_log -from flaml.automl.training_log import training_log_reader -from flaml.tune.spark.utils import check_spark -import os - -spark_available, _ = check_spark() -skip_spark = not spark_available - -os.environ["FLAML_MAX_CONCURRENT"] = "2" - -# To solve pylint issue, we put code for customizing mylearner in a separate file -if os.path.exists(os.path.join(os.getcwd(), "test", "spark", "custom_mylearner.py")): - try: - from test.spark.custom_mylearner import * - from flaml.tune.spark.mylearner import ( - MyRegularizedGreedyForest, - custom_metric, - MyLargeLGBM, - MyLargeXGB, - ) - - skip_my_learner = False - except ImportError: - skip_my_learner = True -else: - skip_my_learner = True - - -class TestMultiClass(unittest.TestCase): - def setUp(self) -> None: - if skip_spark: - self.skipTest("Spark is not installed. 
Skip all spark tests.") - - @unittest.skipIf( - skip_my_learner, - "Please run pytest in the root directory of FLAML, i.e., the directory that contains the setup.py file", - ) - def test_custom_learner(self): - automl = AutoML() - automl.add_learner(learner_name="RGF", learner_class=MyRegularizedGreedyForest) - X_train, y_train = load_wine(return_X_y=True) - settings = { - "time_budget": 8, # total running time in seconds - "estimator_list": ["RGF", "lgbm", "rf", "xgboost"], - "task": "classification", # task type - "sample": True, # whether to subsample training data - "log_file_name": "test/wine.log", - "log_training_metric": True, # whether to log training metric - "n_jobs": 1, - "n_concurrent_trials": 2, - "use_spark": True, - "verbose": 4, - } - automl.fit(X_train=X_train, y_train=y_train, **settings) - # print the best model found for RGF - print(automl.best_model_for_estimator("RGF")) - - MyRegularizedGreedyForest.search_space = lambda data_size, task: {} - automl.fit(X_train=X_train, y_train=y_train, **settings) - - @unittest.skipIf( - skip_my_learner, - "Please run pytest in the root directory of FLAML, i.e., the directory that contains the setup.py file", - ) - def test_custom_metric(self): - df, y = load_iris(return_X_y=True, as_frame=True) - df["label"] = y - automl_experiment = AutoML() - automl_settings = { - "dataframe": df, - "label": "label", - "time_budget": 5, - "eval_method": "cv", - "metric": custom_metric, - "task": "classification", - "log_file_name": "test/iris_custom.log", - "log_training_metric": True, - "log_type": "all", - "n_jobs": 1, - "model_history": True, - "sample_weight": np.ones(len(y)), - "pred_time_limit": 1e-5, - # "ensemble": True, - "n_concurrent_trials": 2, - "use_spark": True, - } - automl_experiment.fit(**automl_settings) - print(automl_experiment.classes_) - print(automl_experiment.model) - print(automl_experiment.config_history) - print(automl_experiment.best_model_for_estimator("rf")) - 
print(automl_experiment.best_iteration) - print(automl_experiment.best_estimator) - automl_experiment = AutoML() - estimator = automl_experiment.get_estimator_from_log( - automl_settings["log_file_name"], record_id=0, task="multiclass" - ) - print(estimator) - ( - time_history, - best_valid_loss_history, - valid_loss_history, - config_history, - metric_history, - ) = get_output_from_log( - filename=automl_settings["log_file_name"], time_budget=6 - ) - print(metric_history) - - def test_classification(self, as_frame=False): - automl_experiment = AutoML() - automl_settings = { - "time_budget": 4, - "metric": "accuracy", - "task": "classification", - "log_file_name": "test/iris.log", - "log_training_metric": True, - "n_jobs": 1, - "model_history": True, - "n_concurrent_trials": 2, - "use_spark": True, - } - X_train, y_train = load_iris(return_X_y=True, as_frame=as_frame) - if as_frame: - # test drop column - X_train.columns = range(X_train.shape[1]) - X_train[X_train.shape[1]] = np.zeros(len(y_train)) - automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) - print(automl_experiment.classes_) - print(automl_experiment.predict(X_train)[:5]) - print(automl_experiment.model) - print(automl_experiment.config_history) - print(automl_experiment.best_model_for_estimator("catboost")) - print(automl_experiment.best_iteration) - print(automl_experiment.best_estimator) - del automl_settings["metric"] - del automl_settings["model_history"] - del automl_settings["log_training_metric"] - automl_experiment = AutoML(task="classification") - duration = automl_experiment.retrain_from_log( - log_file_name=automl_settings["log_file_name"], - X_train=X_train, - y_train=y_train, - train_full=True, - record_id=0, - ) - print(duration) - print(automl_experiment.model) - print(automl_experiment.predict_proba(X_train)[:5]) - - def test_micro_macro_f1(self): - automl_experiment_micro = AutoML() - automl_experiment_macro = AutoML() - automl_settings = { - "time_budget": 2, - 
"task": "classification", - "log_file_name": "test/micro_macro_f1.log", - "log_training_metric": True, - "n_jobs": 1, - "model_history": True, - "n_concurrent_trials": 2, - "use_spark": True, - } - X_train, y_train = load_iris(return_X_y=True) - automl_experiment_micro.fit( - X_train=X_train, y_train=y_train, metric="micro_f1", **automl_settings - ) - automl_experiment_macro.fit( - X_train=X_train, y_train=y_train, metric="macro_f1", **automl_settings - ) - estimator = automl_experiment_macro.model - y_pred = estimator.predict(X_train) - y_pred_proba = estimator.predict_proba(X_train) - from flaml.automl.ml import norm_confusion_matrix, multi_class_curves - - print(norm_confusion_matrix(y_train, y_pred)) - from sklearn.metrics import roc_curve, precision_recall_curve - - print(multi_class_curves(y_train, y_pred_proba, roc_curve)) - print(multi_class_curves(y_train, y_pred_proba, precision_recall_curve)) - - def test_roc_auc_ovr(self): - automl_experiment = AutoML() - X_train, y_train = load_iris(return_X_y=True) - automl_settings = { - "time_budget": 1, - "metric": "roc_auc_ovr", - "task": "classification", - "log_file_name": "test/roc_auc_ovr.log", - "log_training_metric": True, - "n_jobs": 1, - "sample_weight": np.ones(len(y_train)), - "eval_method": "holdout", - "model_history": True, - "n_concurrent_trials": 2, - "use_spark": True, - } - automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) - - def test_roc_auc_ovo(self): - automl_experiment = AutoML() - automl_settings = { - "time_budget": 1, - "metric": "roc_auc_ovo", - "task": "classification", - "log_file_name": "test/roc_auc_ovo.log", - "log_training_metric": True, - "n_jobs": 1, - "model_history": True, - "n_concurrent_trials": 2, - "use_spark": True, - } - X_train, y_train = load_iris(return_X_y=True) - automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) - - def test_roc_auc_ovr_weighted(self): - automl_experiment = AutoML() - automl_settings = { - 
"time_budget": 1, - "metric": "roc_auc_ovr_weighted", - "task": "classification", - "log_file_name": "test/roc_auc_weighted.log", - "log_training_metric": True, - "n_jobs": 1, - "model_history": True, - "n_concurrent_trials": 2, - "use_spark": True, - } - X_train, y_train = load_iris(return_X_y=True) - automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) - - def test_roc_auc_ovo_weighted(self): - automl_experiment = AutoML() - automl_settings = { - "time_budget": 1, - "metric": "roc_auc_ovo_weighted", - "task": "classification", - "log_file_name": "test/roc_auc_weighted.log", - "log_training_metric": True, - "n_jobs": 1, - "model_history": True, - "n_concurrent_trials": 2, - "use_spark": True, - } - X_train, y_train = load_iris(return_X_y=True) - automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) - - def test_sparse_matrix_classification(self): - automl_experiment = AutoML() - automl_settings = { - "time_budget": 2, - "metric": "auto", - "task": "classification", - "log_file_name": "test/sparse_classification.log", - "split_type": "uniform", - "n_jobs": 1, - "model_history": True, - "n_concurrent_trials": 2, - "use_spark": True, - } - X_train = scipy.sparse.random(1554, 21, dtype=int) - y_train = np.random.randint(3, size=1554) - automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) - print(automl_experiment.classes_) - print(automl_experiment.predict_proba(X_train)) - print(automl_experiment.model) - print(automl_experiment.config_history) - print(automl_experiment.best_model_for_estimator("extra_tree")) - print(automl_experiment.best_iteration) - print(automl_experiment.best_estimator) - - @unittest.skipIf( - skip_my_learner, - "Please run pytest in the root directory of FLAML, i.e., the directory that contains the setup.py file", - ) - def _test_memory_limit(self): - automl_experiment = AutoML() - automl_experiment.add_learner( - learner_name="large_lgbm", learner_class=MyLargeLGBM - ) - 
automl_settings = { - "time_budget": -1, - "task": "classification", - "log_file_name": "test/classification_oom.log", - "estimator_list": ["large_lgbm"], - "log_type": "all", - "hpo_method": "random", - "free_mem_ratio": 0.2, - "n_concurrent_trials": 2, - "use_spark": True, - } - X_train, y_train = load_iris(return_X_y=True, as_frame=True) - - automl_experiment.fit( - X_train=X_train, y_train=y_train, max_iter=1, **automl_settings - ) - print(automl_experiment.model) - - @unittest.skipIf( - skip_my_learner, - "Please run pytest in the root directory of FLAML, i.e., the directory that contains the setup.py file", - ) - def test_time_limit(self): - automl_experiment = AutoML() - automl_experiment.add_learner( - learner_name="large_lgbm", learner_class=MyLargeLGBM - ) - automl_experiment.add_learner( - learner_name="large_xgb", learner_class=MyLargeXGB - ) - automl_settings = { - "time_budget": 0.5, - "task": "classification", - "log_file_name": "test/classification_timeout.log", - "estimator_list": ["catboost"], - "log_type": "all", - "hpo_method": "random", - "n_concurrent_trials": 2, - "use_spark": True, - } - X_train, y_train = load_iris(return_X_y=True, as_frame=True) - automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) - print(automl_experiment.model.params) - automl_settings["estimator_list"] = ["large_xgb"] - automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) - print(automl_experiment.model) - automl_settings["estimator_list"] = ["large_lgbm"] - automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) - print(automl_experiment.model) - - def test_fit_w_starting_point(self, as_frame=True): - automl_experiment = AutoML() - automl_settings = { - "time_budget": 3, - "metric": "accuracy", - "task": "classification", - "log_file_name": "test/iris.log", - "log_training_metric": True, - "n_jobs": 1, - "model_history": True, - "n_concurrent_trials": 2, - "use_spark": True, - } - X_train, y_train = 
load_iris(return_X_y=True, as_frame=as_frame) - if as_frame: - # test drop column - X_train.columns = range(X_train.shape[1]) - X_train[X_train.shape[1]] = np.zeros(len(y_train)) - automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) - automl_val_accuracy = 1.0 - automl_experiment.best_loss - print("Best ML leaner:", automl_experiment.best_estimator) - print("Best hyperparmeter config:", automl_experiment.best_config) - print("Best accuracy on validation data: {0:.4g}".format(automl_val_accuracy)) - print( - "Training duration of best run: {0:.4g} s".format( - automl_experiment.best_config_train_time - ) - ) - - starting_points = automl_experiment.best_config_per_estimator - print("starting_points", starting_points) - print("loss of the starting_points", automl_experiment.best_loss_per_estimator) - automl_settings_resume = { - "time_budget": 2, - "metric": "accuracy", - "task": "classification", - "log_file_name": "test/iris_resume.log", - "log_training_metric": True, - "n_jobs": 1, - "model_history": True, - "log_type": "all", - "starting_points": starting_points, - "n_concurrent_trials": 2, - "use_spark": True, - } - new_automl_experiment = AutoML() - new_automl_experiment.fit( - X_train=X_train, y_train=y_train, **automl_settings_resume - ) - - new_automl_val_accuracy = 1.0 - new_automl_experiment.best_loss - print("Best ML leaner:", new_automl_experiment.best_estimator) - print("Best hyperparmeter config:", new_automl_experiment.best_config) - print( - "Best accuracy on validation data: {0:.4g}".format(new_automl_val_accuracy) - ) - print( - "Training duration of best run: {0:.4g} s".format( - new_automl_experiment.best_config_train_time - ) - ) - - def test_fit_w_starting_points_list(self, as_frame=True): - automl_experiment = AutoML() - automl_settings = { - "time_budget": 3, - "metric": "accuracy", - "task": "classification", - "log_file_name": "test/iris.log", - "log_training_metric": True, - "n_jobs": 1, - "model_history": True, - 
"n_concurrent_trials": 2, - "use_spark": True, - } - X_train, y_train = load_iris(return_X_y=True, as_frame=as_frame) - if as_frame: - # test drop column - X_train.columns = range(X_train.shape[1]) - X_train[X_train.shape[1]] = np.zeros(len(y_train)) - automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings) - automl_val_accuracy = 1.0 - automl_experiment.best_loss - print("Best ML leaner:", automl_experiment.best_estimator) - print("Best hyperparmeter config:", automl_experiment.best_config) - print("Best accuracy on validation data: {0:.4g}".format(automl_val_accuracy)) - print( - "Training duration of best run: {0:.4g} s".format( - automl_experiment.best_config_train_time - ) - ) - - starting_points = {} - log_file_name = automl_settings["log_file_name"] - with training_log_reader(log_file_name) as reader: - sample_size = 1000 - for record in reader.records(): - config = record.config - config["FLAML_sample_size"] = sample_size - sample_size += 1000 - learner = record.learner - if learner not in starting_points: - starting_points[learner] = [] - starting_points[learner].append(config) - max_iter = sum([len(s) for k, s in starting_points.items()]) - automl_settings_resume = { - "time_budget": 2, - "metric": "accuracy", - "task": "classification", - "log_file_name": "test/iris_resume_all.log", - "log_training_metric": True, - "n_jobs": 1, - "max_iter": max_iter, - "model_history": True, - "log_type": "all", - "starting_points": starting_points, - "append_log": True, - "n_concurrent_trials": 2, - "use_spark": True, - } - new_automl_experiment = AutoML() - new_automl_experiment.fit( - X_train=X_train, y_train=y_train, **automl_settings_resume - ) - - new_automl_val_accuracy = 1.0 - new_automl_experiment.best_loss - # print('Best ML leaner:', new_automl_experiment.best_estimator) - # print('Best hyperparmeter config:', new_automl_experiment.best_config) - print( - "Best accuracy on validation data: {0:.4g}".format(new_automl_val_accuracy) - ) - # 
print('Training duration of best run: {0:.4g} s'.format(new_automl_experiment.best_config_train_time)) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/spark/test_notebook.py b/test/spark/test_notebook.py deleted file mode 100644 index d900d76f70..0000000000 --- a/test/spark/test_notebook.py +++ /dev/null @@ -1,41 +0,0 @@ -import nbformat -from nbconvert.preprocessors import ExecutePreprocessor -from nbconvert.preprocessors import CellExecutionError -from flaml.tune.spark.utils import check_spark -import os -import pytest - -spark_available, _ = check_spark() -skip_spark = not spark_available - -pytestmark = pytest.mark.skipif( - skip_spark, reason="Spark is not installed. Skip all spark tests." -) - -here = os.path.abspath(os.path.dirname(__file__)) -os.environ["FLAML_MAX_CONCURRENT"] = "2" - - -def run_notebook(input_nb, output_nb="executed_notebook.ipynb", save=False): - try: - file_path = os.path.join(here, os.pardir, os.pardir, "notebook", input_nb) - with open(file_path) as f: - nb = nbformat.read(f, as_version=4) - ep = ExecutePreprocessor(timeout=600, kernel_name="python3") - ep.preprocess(nb, {"metadata": {"path": here}}) - except CellExecutionError: - raise - except Exception as e: - print("\nIgnoring below error:\n", e, "\n\n") - finally: - if save: - with open(os.path.join(here, output_nb), "w", encoding="utf-8") as f: - nbformat.write(nb, f) - - -def test_automl_lightgbm_test(): - run_notebook("integrate_spark.ipynb") - - -if __name__ == "__main__": - test_automl_lightgbm_test() diff --git a/test/spark/test_performance.py b/test/spark/test_performance.py deleted file mode 100644 index 2bf72b9454..0000000000 --- a/test/spark/test_performance.py +++ /dev/null @@ -1,110 +0,0 @@ -import sys -from openml.exceptions import OpenMLServerException -from requests.exceptions import ChunkedEncodingError, SSLError -from flaml.tune.spark.utils import check_spark -import os -import pytest - -spark_available, _ = check_spark() -skip_spark = not 
spark_available - -pytestmark = pytest.mark.skipif( - skip_spark, reason="Spark is not installed. Skip all spark tests." -) - -os.environ["FLAML_MAX_CONCURRENT"] = "2" - - -def run_automl(budget=3, dataset_format="dataframe", hpo_method=None): - from flaml.automl.data import load_openml_dataset - import urllib3 - - performance_check_budget = 3600 - if sys.platform == "darwin" or "nt" in os.name or "3.10" not in sys.version: - budget = 3 # revise the buget if the platform is not linux + python 3.10 - if budget >= performance_check_budget: - max_iter = 60 - performance_check_budget = None - else: - max_iter = None - try: - X_train, X_test, y_train, y_test = load_openml_dataset( - dataset_id=1169, data_dir="test/", dataset_format=dataset_format - ) - except ( - OpenMLServerException, - ChunkedEncodingError, - urllib3.exceptions.ReadTimeoutError, - SSLError, - ) as e: - print(e) - return - - """ import AutoML class from flaml package """ - from flaml import AutoML - - automl = AutoML() - settings = { - "time_budget": budget, # total running time in seconds - "max_iter": max_iter, # maximum number of iterations - "metric": "accuracy", # primary metrics can be chosen from: ['accuracy','roc_auc','roc_auc_ovr','roc_auc_ovo','f1','log_loss','mae','mse','r2'] - "task": "classification", # task type - "log_file_name": "airlines_experiment.log", # flaml log file - "seed": 7654321, # random seed - "hpo_method": hpo_method, - "log_type": "all", - "estimator_list": [ - "lgbm", - "xgboost", - "xgb_limitdepth", - "rf", - "extra_tree", - ], # list of ML learners - "eval_method": "holdout", - "n_concurrent_trials": 2, - "use_spark": True, - } - - """The main flaml automl API""" - automl.fit(X_train=X_train, y_train=y_train, **settings) - - """ retrieve best config and best learner """ - print("Best ML leaner:", automl.best_estimator) - print("Best hyperparmeter config:", automl.best_config) - print("Best accuracy on validation data: {0:.4g}".format(1 - automl.best_loss)) - print( - 
"Training duration of best run: {0:.4g} s".format(automl.best_config_train_time) - ) - print(automl.model.estimator) - print(automl.best_config_per_estimator) - print("time taken to find best model:", automl.time_to_find_best_model) - - """ compute predictions of testing dataset """ - y_pred = automl.predict(X_test) - print("Predicted labels", y_pred) - print("True labels", y_test) - y_pred_proba = automl.predict_proba(X_test)[:, 1] - """ compute different metric values on testing dataset """ - from flaml.automl.ml import sklearn_metric_loss_score - - accuracy = 1 - sklearn_metric_loss_score("accuracy", y_pred, y_test) - print("accuracy", "=", accuracy) - print( - "roc_auc", "=", 1 - sklearn_metric_loss_score("roc_auc", y_pred_proba, y_test) - ) - print("log_loss", "=", sklearn_metric_loss_score("log_loss", y_pred_proba, y_test)) - if performance_check_budget is None: - assert accuracy >= 0.669, "the accuracy of flaml should be larger than 0.67" - - -def test_automl_array(): - run_automl(3, "array", "bs") - - -def test_automl_performance(): - run_automl(3600) - - -if __name__ == "__main__": - test_automl_array() - test_automl_performance() diff --git a/test/spark/test_tune.py b/test/spark/test_tune.py deleted file mode 100644 index bbd482c821..0000000000 --- a/test/spark/test_tune.py +++ /dev/null @@ -1,58 +0,0 @@ -import lightgbm as lgb -import numpy as np -from sklearn.datasets import load_breast_cancer -from sklearn.metrics import accuracy_score -from sklearn.model_selection import train_test_split -from flaml import tune -from flaml.automl.model import LGBMEstimator -from flaml.tune.spark.utils import check_spark -import os -import pytest - -spark_available, _ = check_spark() -skip_spark = not spark_available - -pytestmark = pytest.mark.skipif( - skip_spark, reason="Spark is not installed. Skip all spark tests." 
-) - -os.environ["FLAML_MAX_CONCURRENT"] = "2" -X, y = load_breast_cancer(return_X_y=True) -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25) - - -def train_breast_cancer(config): - params = LGBMEstimator(**config).params - train_set = lgb.Dataset(X_train, label=y_train) - gbm = lgb.train(params, train_set) - preds = gbm.predict(X_test) - pred_labels = np.rint(preds) - result = { - "mean_accuracy": accuracy_score(y_test, pred_labels), - } - return result - - -def test_tune_spark(): - flaml_lgbm_search_space = LGBMEstimator.search_space(X_train.shape) - config_search_space = { - hp: space["domain"] for hp, space in flaml_lgbm_search_space.items() - } - - analysis = tune.run( - train_breast_cancer, - metric="mean_accuracy", - mode="max", - config=config_search_space, - num_samples=-1, - time_budget_s=5, - use_spark=True, - verbose=3, - ) - - # print("Best hyperparameters found were: ", analysis.best_config) - print("The best trial's result: ", analysis.best_trial.last_result) - - -if __name__ == "__main__": - test_tune_spark() diff --git a/test/spark/test_utils.py b/test/spark/test_utils.py deleted file mode 100644 index 757f458b54..0000000000 --- a/test/spark/test_utils.py +++ /dev/null @@ -1,101 +0,0 @@ -from flaml.tune.spark.utils import ( - with_parameters, - check_spark, - get_n_cpus, - get_broadcast_data, -) -from functools import partial -from timeit import timeit -import pytest - -try: - from pyspark.sql import SparkSession - import pyspark - - spark_available, _ = check_spark() - skip_spark = not spark_available -except ImportError: - print("Spark is not installed. Skip all spark tests.") - skip_spark = True - -pytestmark = pytest.mark.skipif( - skip_spark, reason="Spark is not installed. Skip all spark tests." 
-) - - -def test_with_parameters_spark(): - def train(config, data=None): - if isinstance(data, pyspark.broadcast.Broadcast): - data = data.value - print(config, len(data)) - - data = ["a"] * 10**6 - - with_parameters_train = with_parameters(train, data=data) - partial_train = partial(train, data=data) - - spark = SparkSession.builder.getOrCreate() - rdd = spark.sparkContext.parallelize(list(range(2))) - - t_partial = timeit( - lambda: rdd.map(lambda x: partial_train(config=x)).collect(), number=5 - ) - print("python_partial_train: " + str(t_partial)) - - t_spark = timeit( - lambda: rdd.map(lambda x: with_parameters_train(config=x)).collect(), - number=5, - ) - print("spark_with_parameters_train: " + str(t_spark)) - - # assert t_spark < t_partial - - -def test_get_n_cpus_spark(): - n_cpus = get_n_cpus() - assert isinstance(n_cpus, int) - - -def test_broadcast_code(): - from flaml.tune.spark.utils import broadcast_code - from flaml.automl.model import LGBMEstimator - - custom_code = """ - from flaml.automl.model import LGBMEstimator - from flaml import tune - - class MyLargeLGBM(LGBMEstimator): - @classmethod - def search_space(cls, **params): - return { - "n_estimators": { - "domain": tune.lograndint(lower=4, upper=32768), - "init_value": 32768, - "low_cost_init_value": 4, - }, - "num_leaves": { - "domain": tune.lograndint(lower=4, upper=32768), - "init_value": 32768, - "low_cost_init_value": 4, - }, - } - """ - - _ = broadcast_code(custom_code=custom_code) - from flaml.tune.spark.mylearner import MyLargeLGBM - - assert isinstance(MyLargeLGBM(), LGBMEstimator) - - -def test_get_broadcast_data(): - data = ["a"] * 10 - spark = SparkSession.builder.getOrCreate() - bc_data = spark.sparkContext.broadcast(data) - assert get_broadcast_data(bc_data) == data - - -if __name__ == "__main__": - test_with_parameters_spark() - test_get_n_cpus_spark() - test_broadcast_code() - test_get_broadcast_data() diff --git a/test/tune/test_pytorch_cifar10.py 
b/test/tune/test_pytorch_cifar10.py index 188d9750fb..2151bf281d 100644 --- a/test/tune/test_pytorch_cifar10.py +++ b/test/tune/test_pytorch_cifar10.py @@ -313,11 +313,7 @@ def cifar10_main( best_trained_model = nn.DataParallel(best_trained_model) best_trained_model.to(device) - checkpoint_value = ( - getattr(best_trial.checkpoint, "dir_or_data", None) - or best_trial.checkpoint.value - ) - checkpoint_path = os.path.join(checkpoint_value, "checkpoint") + checkpoint_path = os.path.join(best_trial.checkpoint.value, "checkpoint") model_state, optimizer_state = torch.load(checkpoint_path) best_trained_model.load_state_dict(model_state) diff --git a/test/tune/test_searcher.py b/test/tune/test_searcher.py index ff29b522cf..440c08ab0b 100644 --- a/test/tune/test_searcher.py +++ b/test/tune/test_searcher.py @@ -194,8 +194,8 @@ def test_searcher(): searcher.on_trial_complete("t2", None, True) searcher.suggest("t3") searcher.on_trial_complete("t3", {"m": np.nan}) - searcher.save("test/tune/optuna.pkl") - searcher.restore("test/tune/optuna.pkl") + searcher.save("test/tune/optuna.pickle") + searcher.restore("test/tune/optuna.pickle") try: searcher = BlendSearch( metric="m", global_search_alg=searcher, metric_constraints=[("c", "<", 1)] diff --git a/website/docs/Examples/Tune-Lexicographic-objectives.md b/website/docs/Examples/Tune-Lexicographic-objectives.md index b215c37282..c7fff54631 100644 --- a/website/docs/Examples/Tune-Lexicographic-objectives.md +++ b/website/docs/Examples/Tune-Lexicographic-objectives.md @@ -5,7 +5,6 @@ ```python pip install "flaml>=1.1.0" thop torchvision torch ``` -Tuning multiple objectives with Lexicographic preference is a new feature added in version 1.1.0 and is subject to change in future versions. 
## Tuning accurate and efficient neural networks with lexicographic preference @@ -163,4 +162,4 @@ analysis = tune.run( ``` -[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/tune_lexicographic.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/tune_lexicographic.ipynb) +[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/tune_lexicographic.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/tune_lexicographic.ipynb) \ No newline at end of file diff --git a/website/docs/Examples/Tune-PyTorch.md b/website/docs/Examples/Tune-PyTorch.md index d75c716c7f..83f38e6098 100644 --- a/website/docs/Examples/Tune-PyTorch.md +++ b/website/docs/Examples/Tune-PyTorch.md @@ -261,8 +261,7 @@ if torch.cuda.is_available(): best_trained_model = nn.DataParallel(best_trained_model) best_trained_model.to(device) -checkpoint_value = getattr(best_trial.checkpoint, "dir_or_data", None) or best_trial.checkpoint.value -checkpoint_path = os.path.join(checkpoint_value, "checkpoint") +checkpoint_path = os.path.join(best_trial.checkpoint.value, "checkpoint") model_state, optimizer_state = torch.load(checkpoint_path) best_trained_model.load_state_dict(model_state) @@ -284,4 +283,4 @@ Files already downloaded and verified Best trial test set accuracy: 0.6294 ``` -[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/tune_pytorch.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/tune_pytorch.ipynb) +[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/tune_pytorch.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/tune_pytorch.ipynb) \ No newline at end of file diff --git a/website/docs/FAQ.md b/website/docs/FAQ.md index 232e390c05..2fdbcd2fd2 100644 --- a/website/docs/FAQ.md +++ b/website/docs/FAQ.md @@ -66,16 +66,3 @@ 
Packages such as `azureml-interpret` and `sklearn.inspection.permutation_importa Model explanation is frequently asked and adding a native support may be a good feature. Suggestions/contributions are welcome. Optimization history can be checked from the [log](Use-Cases/Task-Oriented-AutoML#log-the-trials). You can also [retrieve the log and plot the learning curve](Use-Cases/Task-Oriented-AutoML#plot-learning-curve). - - -### How to resolve out-of-memory error in `AutoML.fit()` - -* Set `free_mem_ratio` a float between 0 and 1. For example, 0.2 means try to keep free memory above 20% of total memory. Training may be early stopped for memory consumption reason when this is set. -* Set `model_history` False. -* If your data are already preprocessed, set `skip_transform` False. If you can preprocess the data before the fit starts, this setting can save memory needed for preprocessing in `fit`. -* If the OOM error only happens for some particular trials: - - set `use_ray` True. This will increase the overhead per trial but can keep the AutoML process running when a single trial fails due to OOM error. - - provide a more accurate [`size`](reference/automl/model#size) function for the memory bytes consumption of each config for the estimator causing this error. - - modify the [search space](Use-Cases/Task-Oriented-AutoML#a-shortcut-to-override-the-search-space) for the estimators causing this error. - - or remove this estimator from the `estimator_list`. -* If the OOM error happens when ensembling, consider disabling ensemble, or use a cheaper ensemble option. ([Example](Use-Cases/Task-Oriented-AutoML#ensemble)). 
diff --git a/website/docs/Installation.md b/website/docs/Installation.md index 76ad85c561..7cc37943a1 100644 --- a/website/docs/Installation.md +++ b/website/docs/Installation.md @@ -50,28 +50,6 @@ pip install flaml[nlp] ```bash pip install flaml[ray] ``` -* spark -> *Spark support is added in v1.1.0* -```bash -pip install flaml[spark]>=1.1.0 -``` - -For cloud platforms such as [Azure Synapse](https://azure.microsoft.com/en-us/products/synapse-analytics/), Spark clusters are provided. -But you may also need to install `Spark` manually when setting up your own environment. -For latest Ubuntu system, you can install Spark 3.3.0 standalone version with below script. -For more details of installing Spark, please refer to [Spark Doc](https://spark.apache.org/docs/latest/api/python/getting_started/install.html). -```bash -sudo apt-get update && sudo apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \ - ca-certificates-java ca-certificates openjdk-17-jdk-headless \ - && sudo apt-get clean && sudo rm -rf /var/lib/apt/lists/* -wget --progress=dot:giga "https://www.apache.org/dyn/closer.lua/spark/spark-3.3.0/spark-3.3.0-bin-hadoop2.tgz?action=download" \ - -O - | tar -xzC /tmp; archive=$(basename "spark-3.3.0/spark-3.3.0-bin-hadoop2.tgz") \ - bash -c "sudo mv -v /tmp/\${archive/%.tgz/} /spark" -export SPARK_HOME=/spark -export PYTHONPATH=/spark/python/lib/py4j-0.10.9.5-src.zip:/spark/python -export PATH=$PATH:$SPARK_HOME/bin -``` - * nni ```bash pip install flaml[nni] diff --git a/website/docs/Use-Cases/Task-Oriented-AutoML.md b/website/docs/Use-Cases/Task-Oriented-AutoML.md index 1ec2522526..94025e57e8 100644 --- a/website/docs/Use-Cases/Task-Oriented-AutoML.md +++ b/website/docs/Use-Cases/Task-Oriented-AutoML.md @@ -382,11 +382,7 @@ and have ``split`` and ``get_n_splits`` methods with the same signatures. 
To di When you have parallel resources, you can either spend them in training and keep the model search sequential, or perform parallel search. Following scikit-learn, the parameter `n_jobs` specifies how many CPU cores to use for each training job. The number of parallel trials is specified via the parameter `n_concurrent_trials`. By default, `n_jobs=-1, n_concurrent_trials=1`. That is, all the CPU cores (in a single compute node) are used for training a single model and the search is sequential. When you have more resources than what each single training job needs, you can consider increasing `n_concurrent_trials`. -FLAML now support two backends for parallel tuning, i.e., `Ray` and `Spark`. You can use either of them, but not both for one tuning job. - -#### Parallel tuning with Ray - -To do parallel tuning with Ray, install the `ray` and `blendsearch` options: +To do parallel tuning, install the `ray` and `blendsearch` options: ```bash pip install flaml[ray,blendsearch] ``` @@ -401,23 +397,6 @@ automl.fit(X_train, y_train, n_jobs=4, n_concurrent_trials=4) ``` flaml will perform 4 trials in parallel, each consuming 4 CPU cores. The parallel tuning uses the [BlendSearch](Tune-User-Defined-Function##blendsearch-economical-hyperparameter-optimization-with-blended-search-strategy) algorithm. -#### Parallel tuning with Spark - -To do parallel tuning with Spark, install the `spark` and `blendsearch` options: - -> *Spark support is added in v1.1.0* -```bash -pip install flaml[spark,blendsearch]>=1.1.0 -``` - -For more details about installing Spark, please refer to [Installation](../Installation#Distributed-tuning). - -An example of using Spark for parallel tuning is: -```python -automl.fit(X_train, y_train, n_concurrent_trials=4, use_spark=True) -``` -For Spark clusters, by default, we will launch one trial per executor. However, sometimes we want to launch more trials than the number of executors (e.g., local mode). 
In this case, we can set the environment variable `FLAML_MAX_CONCURRENT` to override the detected `num_executors`. The final number of concurrent trials will be the minimum of `n_concurrent_trials` and `num_executors`. Also, GPU training is not supported yet when use_spark is True. - #### **Guidelines on parallel vs sequential tuning** **(1) Considerations on wall-clock time.** diff --git a/website/docs/Use-Cases/Tune-User-Defined-Function.md b/website/docs/Use-Cases/Tune-User-Defined-Function.md index 858d2d07c7..39f04eb863 100644 --- a/website/docs/Use-Cases/Tune-User-Defined-Function.md +++ b/website/docs/Use-Cases/Tune-User-Defined-Function.md @@ -100,14 +100,14 @@ If it is a numerical hyperparameter, you need to know whether it takes integer v ```python { - "learning_rate": tune.loguniform(lower=1 / 1024, upper=1.0), +"learning_rate": tune.loguniform(lower=1 / 1024, upper=1.0), } ``` When the search range of learning rate is small, it is more common to sample in the linear scale as shown in the following example, ```python { - "learning_rate": tune.uniform(lower=0.1, upper=0.2), +"learning_rate": tune.uniform(lower=0.1, upper=0.2), } ``` @@ -117,7 +117,7 @@ When the search range of learning rate is small, it is more common to sample in When you have a desired quantization granularity for the hyperparameter change, you can use `tune.qlograndint` or `tune.qloguniform` to realize the quantization requirement. The following code example helps you realize the need for sampling uniformly in the range of 0.1 and 0.2 with increments of 0.02, i.e., the sampled learning rate can only take values in {0.1, 0.12, 0.14, 0.16, ..., 0.2}, ```python { - "learning_rate": tune.quniform(lower=0.1, upper=0.2, q=0.02), +"learning_rate": tune.quniform(lower=0.1, upper=0.2, q=0.02), } ``` @@ -290,13 +290,10 @@ The key difference between these two types of constraints is that the calculatio Related arguments: - `use_ray`: A boolean of whether to use ray as the backend.
-- `use_spark`: A boolean of whether to use spark as the backend. - `resources_per_trial`: A dictionary of the hardware resources to allocate per trial, e.g., `{'cpu': 1}`. Only valid when using ray backend. -You can perform parallel tuning by specifying `use_ray=True` (requiring flaml[ray] option installed) or `use_spark=True` -(requiring flaml[spark] option installed). You can also limit the amount of resources allocated per trial by specifying `resources_per_trial`, -e.g., `resources_per_trial={'cpu': 2}` when `use_ray=True`. +You can perform parallel tuning by specifying `use_ray=True` (requiring flaml[ray] option installed). You can also limit the amount of resources allocated per trial by specifying `resources_per_trial`, e.g., `resources_per_trial={'cpu': 2}`. ```python # require: pip install flaml[ray] @@ -314,21 +311,6 @@ print(analysis.best_trial.last_result) # the best trial's result print(analysis.best_config) # the best config ``` -```python -# require: pip install flaml[spark] -analysis = tune.run( - evaluate_config, # the function to evaluate a config - config=config_search_space, # the search space defined - metric="score", - mode="min", # the optimization mode, "min" or "max" - num_samples=-1, # the maximal number of configs to try, -1 means infinite - time_budget_s=10, # the time budget in seconds - use_spark=True, -) -print(analysis.best_trial.last_result) # the best trial's result -print(analysis.best_config) # the best config -``` - **A headsup about computation overhead.** When parallel tuning is used, there will be a certain amount of computation overhead in each trial. In case each trial's original cost is much smaller than the overhead, parallel tuning can underperform sequential tuning. Sequential tuning is recommended when compute resource is limited, and each trial can consume all the resources. 
@@ -547,7 +529,7 @@ In the following example, we want to minimize `val_loss` and `pred_time` of the ```python lexico_objectives = {} lexico_objectives["metrics"] = ["val_loss", "pred_time"] -lexico_objectives["modes"] = ["min", "min"] +lexico_objectives["modes"] = ["min", "min"] lexico_objectives["tolerances"] = {"val_loss": 0.02, "pred_time": 0.0} lexico_objectives["targets"] = {"val_loss": -float('inf'), "pred_time": -float('inf')}