diff --git a/Project_2_Haoyue b/Project_2_Haoyue
deleted file mode 100644
index 4e646a1..0000000
--- a/Project_2_Haoyue
+++ /dev/null
@@ -1,93 +0,0 @@
-
-import numpy as np
-import pandas as pd
-import matplotlib.pyplot as plt
-
-# Data loading and cleaning
-airing = pd.read_csv('/Users/zhanghaoyue/Downloads/airing.csv')
-ads = pd.read_csv('/Users/zhanghaoyue/Downloads/ads.csv')
-airing['message'] = airing['message'].replace('mix', 'mixed')
-ads = ads[ads['air_count'] != 0]
-
-# Analyze airtime and reference count
-airing['start_time'] = pd.to_datetime(airing['start_time'])
-airing['end_time'] = pd.to_datetime(airing['end_time'])
-
-airing['airtime'] = (airing['end_time'] - airing['start_time']).dt.total_seconds()
-
-print(airing.groupby('wp_identifier')['airtime'].agg(lambda x: x.max() - x.min()).sort_values(ascending=False))  # airtime range per ad
-
-airtime_avg = airing.groupby('wp_identifier')[['airtime']].agg(['mean', 'sum']).reset_index()
-airtime_avg.columns = ['_'.join(col).strip('_') for col in airtime_avg.columns.to_flat_index()]  # flatten MultiIndex columns to airtime_mean / airtime_sum
-
-airtime_ref = pd.merge(airtime_avg, ads, how='left', on='wp_identifier')
-print(airtime_ref[['wp_identifier', 'airtime_mean', 'airtime_sum', 'reference_count']])
-
-
-airtime_ref.plot(kind='scatter', x='airtime_mean', y='reference_count')
-plt.xlabel('Airtime (seconds)', fontsize=14)
-plt.ylabel('Reference Count', fontsize=14)
-plt.title('Single Airing Duration and Reference Count', fontsize=16)
-plt.show()
-
-airtime_ref.plot(kind='scatter', x='airtime_sum', y='reference_count')
-plt.xlabel('Airtime (total seconds)', fontsize=14)
-plt.ylabel('Reference Count', fontsize=14)
-plt.title('Total Airing Time and Reference Count', fontsize=16)
-plt.show()
-
-# Pivot and summarize ad tone for each candidate
-airing['cand_list'] = airing['candidates'].str.split(', ')
-candidate_explode = airing.explode('cand_list', ignore_index=True)
-candidate_loc = candidate_explode.groupby('cand_list')[['location']].nunique()
-print(candidate_loc.sort_values(by='location'))
-
-candidate_tone = candidate_explode.groupby(['cand_list', 'message'])[['wp_identifier']].count().reset_index()
-candidate_tone_pivot = candidate_tone.pivot(index='cand_list', columns='message', values='wp_identifier')
-candidate_tone_pivot_perc = candidate_tone_pivot.div(candidate_tone_pivot.sum(axis=1), axis=0) * 100
-print(candidate_tone_pivot)
-print(candidate_tone_pivot_perc)
-
-# Relationship between tone and geographic reach
-loc_tone = pd.merge(candidate_tone_pivot_perc, candidate_loc, on='cand_list', how='left')
-loc_tone.plot(x='location', y='con', kind='scatter')
-plt.xlabel('Count of Cities', fontsize=14)
-plt.ylabel('Negative Message %', fontsize=14)
-plt.title('Geographic Reach and Negative Message %', fontsize=16)
-plt.show()
-
-cand_tone_top_10 = loc_tone.sort_values(by='location', ascending=False).head(10)
-
-# Getting subjects of candidates
-candidate_explode['subjects'] = candidate_explode['subjects'].str.split(', ')
-candidate_explode_content = candidate_explode.explode('subjects', ignore_index=True)
-candidate_content = candidate_explode_content.groupby(['cand_list', 'subjects'])[['wp_identifier']].count().reset_index()
-print(candidate_content.sort_values(by='wp_identifier', ascending=False))
-
-# Analysis of geographic reach and subjects
-loc_content = pd.merge(candidate_loc, candidate_content, on='cand_list', how='left')
-
-loc_content_count = loc_content.groupby('cand_list').agg({
-    'location': 'mean',    # one value per candidate, repeated across rows; mean carries it through
-    'subjects': 'nunique'  # count of distinct subjects per candidate
-})
-loc_content_count.plot(x='location', y='subjects', kind='scatter')
-plt.xlabel('Count of Cities', fontsize=14)
-plt.ylabel('Count of Subjects', fontsize=14)
-plt.show()
-
-hillary_data = loc_content[loc_content['cand_list'] == 'Hillary Clinton'].nlargest(10, 'wp_identifier')
-roy_data = loc_content[loc_content['cand_list'] == 'Roy Cooper'].nlargest(10, 'wp_identifier')
-
-
-plt.barh(hillary_data['subjects'], hillary_data['wp_identifier'], color='skyblue')
-plt.xlabel('Airing Count', fontsize=14)
-plt.ylabel('Subjects', fontsize=14)
-plt.title('Hillary Clinton', fontsize=14)
-plt.show()
-
-plt.barh(roy_data['subjects'], roy_data['wp_identifier'], color='skyblue')
-plt.xlabel('Airing Count', fontsize=14)
-plt.ylabel('Subjects', fontsize=14)
-plt.title('Roy Cooper', fontsize=14)
-plt.show()
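
Note: the split/explode/pivot tone summary is the least obvious step in the deleted script, so below is a minimal standalone sketch of just that step. The three-row frame is hypothetical toy data; only the column names (wp_identifier, candidates, message) are taken from the script, and the tone percentages it prints are illustrative only.

# Hypothetical toy frame; only the column names come from the deleted script.
import pandas as pd

toy = pd.DataFrame({
    'wp_identifier': ['ad1', 'ad2', 'ad3'],
    'candidates': ['Hillary Clinton, Roy Cooper', 'Hillary Clinton', 'Roy Cooper'],
    'message': ['pro', 'con', 'con'],
})

# One row per (ad, candidate) pair, mirroring airing.explode('cand_list').
toy['cand_list'] = toy['candidates'].str.split(', ')
exploded = toy.explode('cand_list', ignore_index=True)

# Count ads per candidate and tone, then pivot tones into columns.
tone_counts = exploded.groupby(['cand_list', 'message'])['wp_identifier'].count().reset_index()
tone_pivot = tone_counts.pivot(index='cand_list', columns='message',
                               values='wp_identifier').fillna(0)

# Row-normalize to percentages so candidates with different ad volumes are comparable.
tone_pct = tone_pivot.div(tone_pivot.sum(axis=1), axis=0) * 100
print(tone_pct)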