```python
from dpark import DparkContext

dpark_context = DparkContext()
data = dpark_context.textFile("data.txt")
# Drop duplicate lines, then convert each remaining line to an integer
cleaned_data = data.distinct()
converted_data = cleaned_data.map(int)
# Collect the results to the driver and print them
print(converted_data.collect())
dpark_context.stop()
```
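
If `data.txt` can contain blank or otherwise non-numeric lines, the `int` conversion above will raise a `ValueError`. Below is a minimal defensive variant, assuming the values are non-negative integers so that `str.isdigit()` is a sufficient guard (the guard is an assumption, not part of the original example):

```python
from dpark import DparkContext

dpark_context = DparkContext()
data = dpark_context.textFile("data.txt")
# Keep only lines that parse as non-negative integers before converting;
# str.isdigit() is an assumed guard and rejects signs, floats, and blanks
numeric_data = (data.distinct()
                    .filter(lambda line: line.strip().isdigit())
                    .map(int))
print(numeric_data.collect())
dpark_context.stop()
```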

```python
from dpark import DparkContext

dpark_context = DparkContext()
data = dpark_context.textFile("data.txt")
# Split each line into words, emit (word, 1) pairs, and sum the counts per word
word_count = (data.flatMap(lambda line: line.split())
                  .map(lambda word: (word, 1))
                  .reduceByKey(lambda a, b: a + b))
# Collect the (word, count) pairs to the driver and print them
for word, count in word_count.collect():
    print(word, count)
dpark_context.stop()
```
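
To see the most frequent words first, the collected pairs can be sorted on the driver with plain Python. A short sketch, assuming the distinct-word set fits in driver memory (the top-10 cutoff is an arbitrary choice for illustration):

```python
from dpark import DparkContext

dpark_context = DparkContext()
word_count = (dpark_context.textFile("data.txt")
                           .flatMap(lambda line: line.split())
                           .map(lambda word: (word, 1))
                           .reduceByKey(lambda a, b: a + b))
# Sort locally after collect(); fine while the vocabulary is small
top_words = sorted(word_count.collect(), key=lambda kv: kv[1], reverse=True)
for word, count in top_words[:10]:
    print(word, count)
dpark_context.stop()
```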

```python
from dpark import DparkContext
from sklearn.ensemble import RandomForestClassifier

dpark_context = DparkContext()
# Load the raw training data with DPark
train_data = dpark_context.textFile("train_data.txt")
# ... (preprocessing elided: train_features and train_labels are built here)
# Train a random forest locally with scikit-learn
model = RandomForestClassifier(n_estimators=100)
model.fit(train_features, train_labels)
# Load and preprocess the test data the same way
test_data = dpark_context.textFile("test_data.txt")
# ... (preprocessing elided: test_features are built here)
# predict() returns a NumPy array, not an RDD, so print it directly
test_predictions = model.predict(test_features)
print(test_predictions)
dpark_context.stop()
```
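
The `# ...` markers above elide the preprocessing that produces `train_features`, `train_labels`, and `test_features`; it depends entirely on the file format, which the original does not specify. Purely as a hypothetical sketch, assuming comma-separated lines of floats with an integer class label in the last column (`parse_line` and the format are assumptions, not part of the original):

```python
from dpark import DparkContext
from sklearn.ensemble import RandomForestClassifier

dpark_context = DparkContext()

def parse_line(line):
    # Assumed format: comma-separated floats, class label in the last column
    parts = line.strip().split(",")
    return [float(x) for x in parts[:-1]], int(parts[-1])

# Parse in parallel with DPark, then collect locally for scikit-learn,
# which trains on in-memory data rather than on RDDs
rows = dpark_context.textFile("train_data.txt").map(parse_line).collect()
train_features = [features for features, label in rows]
train_labels = [label for features, label in rows]

model = RandomForestClassifier(n_estimators=100)
model.fit(train_features, train_labels)
dpark_context.stop()
```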