import pandas as pd
from collections import defaultdict
import datetime
def read_excel(file_path):
df = pd.read_excel(file_path)
return df
def keyword_filter_and_count_by_year(df, company_column, date_column, text_column, keywords):
df[date_column] = pd.to_datetime(df[date_column]).dt.to_period('Y')
grouped_data = df.groupby([company_column, date_column])
summary_data = []
for (company, year), group in grouped_data:
num_filtered_rows = 0
keyword_counter = defaultdict(int)
for index, row in group.iterrows():
text = row[text_column]
row_keywords = set()
for keyword in keywords:
if keyword in text:
row_keywords.add(keyword)
keyword_counter[keyword] += 1
if row_keywords:
num_filtered_rows += 1
keyword_counts = {k: keyword_counter[k] for k in keywords}
summary_data.append((company, year, num_filtered_rows, *keyword_counts.values()))
return summary_data
def write_to_excel(summary_data, output_file_path, keywords):
columns = ['证券代码', '年份', '包含关键词的提问个数'] + keywords
summary_df = pd.DataFrame(summary_data, columns=columns)
summary_df.to_excel(output_file_path, index=False)
def main():
input_file_path = " "
output_file_path = ' '
company_column = 'Symbol'
date_column = 'QuestionDate'
text_column = 'QuestionContent'
keywords = [ ]
df = read_excel(input_file_path)
summary_data = keyword_filter_and_count_by_year(df, company_column, date_column, text_column, keywords)
write_to_excel(summary_data, output_file_path, keywords)
if __name__ == '__main__':
main()
因篇幅问题不能全部显示,请点此查看更多更全内容