pandas 随机森林分类器项目-预测在线新闻流行度

3gtaxfhh  于 2023-06-04  发布在  其他
关注(0)|答案(1)|浏览(299)
# Sample data for training (increased size).
# 43 short English headlines; each one is a single training document for the
# bag-of-words model built further down.
news_headlines = [
    "Amazing breakthrough in AI technology",
    "New study reveals health benefits of green tea",
    "Tips for improving productivity at work",
    "Latest smartphone reviews and comparisons",
    "The impact of climate change on global economies",
    "Innovative startup receives funding for expansion",
    "Guide to creating stunning website designs",
    "Celebrity gossip and entertainment news",
    "Exploring the mysteries of deep space",
    "Delicious recipes for homemade desserts",
    "Upcoming conference on artificial intelligence",
    "Worldwide increase in renewable energy investments",
    "Interview with renowned author on their latest book",
    "Expert tips for successful weight loss",
    "Breaking news: major scientific discovery announced",
    "Insights into the future of virtual reality technology",
    "New study suggests link between exercise and brain health",
    "Analysis of the latest stock market trends",
    "Famous actor to star in upcoming blockbuster movie",
    "Exploring the beauty of underwater coral reefs",
    "Exciting new features announced for popular software",
    "The rise of remote work and its impact on businesses",
    "Healthcare advancements for better patient outcomes",
    "Top destinations for travel enthusiasts",
    "Emerging trends in fashion and style",
    "Innovations in sustainable energy solutions",
    "Tips for successful entrepreneurship",
    "The future of autonomous vehicles",
    "Artificial intelligence in everyday life",
    "Latest updates on the cryptocurrency market",
    "Techniques for effective time management",
    "Advancements in medical research",
    "Unveiling the latest smartphone models",
    "Tips for maintaining a healthy work-life balance",
    "The impact of social media on society",
    "New discoveries in space exploration",
    "Evolving trends in digital marketing",
    "Healthy habits for a strong immune system",
    "Insights into the future of virtual assistants",
    "Innovative approaches to renewable energy",
    "The influence of technology on education",
    "Current trends in home decor",
    "Exploring the wonders of wildlife photography"
]

# Target labels indicating whether the news became popular or not
# (1 = popular, 0 = not popular). The list is positional: labels[i] is the
# label of news_headlines[i], so both lists must stay the same length.
labels = [1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1]


# Preprocess the text data using CountVectorizer: every headline becomes a
# row of word counts (lowercased, English stop words removed).
vectorizer = CountVectorizer(stop_words='english', lowercase=True)
X = vectorizer.fit_transform(news_headlines).toarray()
y = np.array(labels)

# Split the data into training and testing sets.
# stratify=y keeps the popular / not-popular ratio the same in both parts,
# which matters because the test set here is only a handful of samples.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train a Random Forest Classifier.
# The forest is seeded like the split above; without a fixed random_state the
# trees (and therefore the reported accuracy) change on every run.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Get user input for news headline
user_input = input("Enter the news headline: ")
user_input_vector = vectorizer.transform([user_input]).toarray()

# The vectorizer only counts words it saw while being fitted; every other
# word is ignored. When the headline contains none of those words the feature
# vector is all zeros, and every such headline receives the very same
# prediction. Tell the user instead of presenting that as a real result.
if not user_input_vector.any():
    print("Warning: none of the words in this headline appear in the "
          "training vocabulary, so the prediction below is not informative.")

# Predict the popularity of the news headline
prediction = clf.predict(user_input_vector)

# Output the prediction result
if prediction[0] == 1:
    print("The news headline is likely to be popular.")
else:
    print("The news headline is not likely to be popular.")

# Score the held-out test split and report the fraction of correct predictions
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Model accuracy:", accuracy)

# Draw the actual and the predicted label of every test sample on one chart
# (this shows the individual predictions, not the accuracy value itself).
for series, line_format, series_label in (
    (y_test, 'r-', 'Actual'),
    (y_pred, 'b-', 'Predicted'),
):
    plt.plot(range(len(series)), series, line_format, label=series_label)

plt.title('Model Predictions vs. Actual')
plt.xlabel('Sample index')
plt.ylabel('Popularity (1: Popular, 0: Not Popular)')
plt.legend()
plt.show()

运行这段代码时，无论输入什么标题，它总是打印下面的内容。另外，怎样才能提高验证准确率？
The news headline is likely to be popular.
Model accuracy: 0.3333333333333333

我已经尝试了所有能想到的方法。请帮帮我，我需要你的帮助。
另外，应该如何进行超参数优化？

g6ll5ycj

g6ll5ycj1#

我认为，使用逻辑回归分类器并结合极性（polarity）和主观性（subjectivity）标签，会比在分词后的数值上训练随机森林得到更高的准确率。下面是我的随机森林实现，其准确率为 46%。

# Sample headlines used as the training corpus (43 entries); every later
# feature list in this script is built by iterating over this list in order.
news_headlines = [
    "Amazing breakthrough in AI technology",
    "New study reveals health benefits of green tea",
    "Tips for improving productivity at work",
    "Latest smartphone reviews and comparisons",
    "The impact of climate change on global economies",
    "Innovative startup receives funding for expansion",
    "Guide to creating stunning website designs",
    "Celebrity gossip and entertainment news",
    "Exploring the mysteries of deep space",
    "Delicious recipes for homemade desserts",
    "Upcoming conference on artificial intelligence",
    "Worldwide increase in renewable energy investments",
    "Interview with renowned author on their latest book",
    "Expert tips for successful weight loss",
    "Breaking news: major scientific discovery announced",
    "Insights into the future of virtual reality technology",
    "New study suggests link between exercise and brain health",
    "Analysis of the latest stock market trends",
    "Famous actor to star in upcoming blockbuster movie",
    "Exploring the beauty of underwater coral reefs",
    "Exciting new features announced for popular software",
    "The rise of remote work and its impact on businesses",
    "Healthcare advancements for better patient outcomes",
    "Top destinations for travel enthusiasts",
    "Emerging trends in fashion and style",
    "Innovations in sustainable energy solutions",
    "Tips for successful entrepreneurship",
    "The future of autonomous vehicles",
    "Artificial intelligence in everyday life",
    "Latest updates on the cryptocurrency market",
    "Techniques for effective time management",
    "Advancements in medical research",
    "Unveiling the latest smartphone models",
    "Tips for maintaining a healthy work-life balance",
    "The impact of social media on society",
    "New discoveries in space exploration",
    "Evolving trends in digital marketing",
    "Healthy habits for a strong immune system",
    "Insights into the future of virtual assistants",
    "Innovative approaches to renewable energy",
    "The influence of technology on education",
    "Current trends in home decor",
    "Exploring the wonders of wildlife photography"
]

# Target labels indicating whether the news became popular or not
# (presumably 1 = popular, 0 = not popular). Positional: target[i] belongs to
# news_headlines[i], so both lists must have the same length.
target = [1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1]

# Score every headline once with TextBlob and keep its sentiment result.
headline_sentiments = [TextBlob(headline).sentiment for headline in news_headlines]

# 1 when the sentiment is strong in either direction (|polarity| > 0.3), else 0.
polarity = [int(abs(scored.polarity) > 0.3) for scored in headline_sentiments]
print(polarity)

# 1 when the headline reads as noticeably subjective (subjectivity > 0.2), else 0.
subjectivity = [int(scored.subjectivity > 0.2) for scored in headline_sentiments]
print(subjectivity)

nlp = spacy.load('en_core_web_sm')
stopwords=spacy.lang.en.stop_words.STOP_WORDS

wordnet_lemmatizer = WordNetLemmatizer()

# Turn every headline into a list of cleaned, lemmatized tokens.
sentences=[]
for sentence in news_headlines:
    word_list=nltk.word_tokenize(sentence)

    # Lowercase BEFORE filtering so capitalised words such as "The" are also
    # matched against the stop-word list.
    lower_tokens = [token.lower() for token in word_list]

    # Keep only alphanumeric, non-stop-word tokens.
    # The previous test `word.isalnum==False` compared the method object
    # itself with False, which is never true, so punctuation such as ":" was
    # never removed. Note that tokens containing punctuation (for example a
    # hyphenated word) are dropped as well, because isalnum() is False for them.
    lower_tokens = [
        token for token in lower_tokens
        if token.isalnum() and token not in stopwords
    ]
    print(lower_tokens)

    # Lemmatize all tokens into a new list: lemmatized
    lemmatized = [wordnet_lemmatizer.lemmatize(t) for t in lower_tokens]
    sentences.append(lemmatized)

# The longest token list decides the width every sequence is padded to.
max_encoder_seq_length = max(map(len, sentences))
print("encoder sequence length", max_encoder_seq_length)

# Map every token to an integer id, then pad the id sequences at the end
# ('post') so all rows have the same length.
input_tokenizer = Tokenizer()
input_tokenizer.fit_on_texts(sentences)
encoder_x_train = input_tokenizer.texts_to_sequences(sentences)
encoder_input_data = pad_sequences(
    encoder_x_train,
    maxlen=max_encoder_seq_length,
    padding='post',
)

# One more than the number of distinct tokens
# (presumably to account for the padding id 0 - TODO confirm).
input_vocab_size = 1 + len(input_tokenizer.word_index)

# Extend every padded token-id row with its two sentiment flags
# (polarity first, subjectivity second).
polarity_subjectivity = []
encoder_input_data2 = []
for row_idx, token_ids in enumerate(encoder_input_data):
    sentiment_flags = [polarity[row_idx], subjectivity[row_idx]]
    encoder_input_data2.append(np.concatenate([token_ids, sentiment_flags]))

# From here on the feature rows are the extended ones.
encoder_input_data = encoder_input_data2
max_encoder_seq_length = max(map(len, encoder_input_data))

# Debug output: label next to the two sentiment flags of the same headline.
for row_idx, label in enumerate(target):
    print(label, polarity[row_idx], subjectivity[row_idx])

# Hold out 30% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    encoder_input_data,
    target,
    test_size=0.3,
    random_state=42,
)
features_count = len(encoder_input_data[0])

# Random forest that may look at every feature column at each split
# (max_features equals the total number of columns).
forest = RandomForestClassifier(
    n_estimators=400,
    max_depth=8,
    min_samples_leaf=3,
    max_features=features_count,
    random_state=42,
)

# Scale the features, then classify.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', forest),
])
pipeline.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = pipeline.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

print("Accuracy Score", accuracy_score(y_test, y_pred))

相关问题