import requests as req
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup


web = req.get('https://cmsc320.github.io/files/top-50-solar-flares.html')


soup = BeautifulSoup(web.text)


#print(soup.prettify())


table = soup.find('table')


# imports table from html to pandas data frame
df1 = pd.read_html(web.text, flavor="bs4")[0]


df1.columns = ['rank', 'x_class', 'date', 'region', 'start_time', 'max_time', 'end_time', 'movie']

df1


# removes movie column
df1 = df1.drop(columns='movie')

df1


# loops through rows and convert time columns to datetime columns
for i, row in df1.iterrows():
    df1.at[i, 'start_time'] = pd.to_datetime(row.at['date'] + " " + row.at['start_time'])
    df1.at[i, 'max_time'] = pd.to_datetime(row.at['date'] + " " + row.at['max_time'])
    if df1.at[i, 'max_time'] < df1.at[i, 'start_time']: # if flare max after midnight, add a day
        df1.at[i, 'max_time'] += pd.Timedelta(days=1)
    df1.at[i, 'end_time'] = pd.to_datetime(row.at['date'] + " " + row.at['end_time'])
    if df1.at[i, 'end_time'] < df1.at[i, 'start_time']: # if end after midnight, add a day
        df1.at[i, 'end_time'] += pd.Timedelta(days=1)
    
df1 = df1.drop(columns='date')

df1.columns = ['rank', 'x_class', 'region', 'start_datetime', 'max_datetime', 'end_datetime']


# replaces missing data with NaN
df1 = df1.replace({'-' : np.nan})

df1


web = req.get('https://cdaw.gsfc.nasa.gov/CME_list/radio/waves_type2.html')


soup = BeautifulSoup(web.text)


#print(soup.prettify())


# finds the <pre> containing the table and split its lines
pre = soup.find('pre')
lines = pre.text.splitlines()

# removes excess strings at beginning and end of list
for i in range(12):
    del lines[0]

del lines[len(lines) - 1]

#lines


# puts the lines into a data frame
df2 = pd.DataFrame(lines, columns=['string'])

df2


# splits strings by ' ' and expand each block into its own column
df2 = df2['string'].str.split(expand=True)

# drops columns caused by text to the right of table on website
df2 = df2.drop(range(14, 24), axis=1)

# assigns appropriate column names
df2.columns=['start_date', 'start_time', 'end_date', 'end_time', 'start_frequency', 'end_frequency', 'flare_location', 'flare_region', 'importance',  'cme_date', 'cme_time',  'cpa', 'width', 'speed']

df2


# while working on this step I wanted to be able to see all rows of the table, this removes the display limit
#pd.set_option('display.max_rows', None)


# replaces missing data with NaN
df2 = df2.replace({'????' : np.nan, '----' : np.nan , '-----' : np.nan, '------' : np.nan, '--/--' : np.nan, '--:--' : np.nan})


# adds column to indicate if row corresponds to a halo flare
df2.insert(len(df2.columns), 'is_halo', False)

# loops through rows and if halo flare, set cpa to NaN and is_halo to true
for i, row in df2.iterrows():
    if row.at['cpa'] == 'Halo':
        df2.at[i, 'cpa'] = np.nan
        df2.at[i, 'is_halo'] = True


# adds column to indicate if width is given as a lower bound
df2.insert(len(df2.columns), 'width_lower_bound', False)

# loops through rows and if width is lower bound, removes the '>' and set width_lower_bound to true
for i, row in df2.iterrows():
    if  str(row.at['width'])[0] == '>':
        df2.at[i, 'width'] = row.at['width'].replace('>', '')
        df2.at[i, 'width_lower_bound'] = True


# loops through rows and convert time columns to datetime columns
for i, row in df2.iterrows():
    
    # start
    inc = 0 # how many days to increment by (accounts for time = 24:00)
    date = row.at['start_date']
    time = row.at['start_time']
    if time == '24:00':
        time = '00:00'
        inc += 1
        
    datetime = pd.to_datetime(date + " " + time)
    datetime = datetime + pd.Timedelta(days=inc)
    df2.at[i, 'start_time'] = datetime
    
    # end
    inc = 0
    date = str(df2.at[i, 'start_time'].year) + "/" + row.at['end_date']
    time = row.at['end_time']
    if time == '24:00':
        time = '00:00'
        inc += 1
    
    datetime = pd.to_datetime(date + " " + time)
    datetime = datetime + pd.Timedelta(days=inc)
    if datetime < df2.at[i, 'start_time']: # if solar flare spanned new year
        new_year = datetime.year + 1
        datetime.replace(year=new_year)
    
    df2.at[i, 'end_time'] = datetime
    
    # cme
    if row.at['cme_date'] != row.at['cme_date']: # if NaN, this is true
        datetime = np.nan
    else:
        inc = 0
        date = str(df2.at[i, 'start_time'].year) + "/" + row.at['cme_date']
        time = row.at['cme_time']
        if time == '24:00':
            time = '00:00'
            inc += 1

        datetime = pd.to_datetime(date + " " + time)
        datetime = datetime + pd.Timedelta(days=inc)
        if datetime < df2.at[i, 'start_time']: # if cme after new year
            new_year = datetime.year + 1
            datetime.replace(year=new_year)
    
    df2.at[i, 'cme_time'] = datetime
    
df2 = df2.drop(columns=['start_date', 'end_date', 'cme_date'])
df2 = df2.rename(columns={'start_time': 'start_datetime', 'end_time': 'end_datetime', 'cme_time': 'cme_datetime'})

df2


# creates new data frame with NASA data, removes all NaN importance rows
df2_top50 = df2.dropna(subset=['importance'])

# loops through rows, drops non X-class flares and makes importnace a float so rows can be sorted
# (all of top 50 will be X-class, so this is okay)
for i, row in df2_top50.iterrows():
    if row.at['importance'][0] == 'X':
        df2_top50.at[i, 'importance'] = float(row.at['importance'].replace('X', ''))
    else:
        df2_top50 = df2_top50.drop(i)
df2_top50.rename(columns={'importance': 'x_class'}, inplace=True) # they're all X-class now

# sorts by X-class
df2_top50 = df2_top50.sort_values(by='x_class', ascending=False)

df2_top50.reset_index(inplace=True)

# drop all rows outside of the top 50
df2_top50 = df2_top50.drop(range(50, len(df2_top50)))


df2_top50

df1


df2.insert(len(df2.columns), 'rank', np.nan) # column for rank from SpaceWeaterLive

# loop through SWL flares and NASA flares, find the NASA flare with a starting datetime that's closest to
# the corresponding SWL one and set its rank to the corresponding SWL rank
for i1, row1 in df1.iterrows():
    lowest = abs(row1.at['start_datetime'] - df2.at[0, 'start_datetime']).total_seconds()
    lowest_i = 0
    for i2, row2 in df2.iterrows():
        current = abs(row1.at['start_datetime'] - row2.at['start_datetime']).total_seconds()
        rank = df2.at[i2, 'rank'] # rank that's already associated with row
        if rank != rank and current < lowest: # set lowest if lowest timedelta row's rank is NaN
            lowest = current
            lowest_i = i2
    if lowest < 21600: # must be less than 6 hours apart
        df2.at[lowest_i, 'rank'] = row1.at['rank']


df2.sort_values(by='rank').reset_index()


# new dataframe, only columns are datetime and rank
df = df2.loc[:, df2.columns.intersection(['start_datetime','rank'])]
df.columns = ['month', 'in_top_50']

# set each datetime to first day of that month
df['month'] = df['month'].astype('datetime64[M]') 

# change rank to boolean that classifies if a flare is in top 50
def rank_to_bool(rank):
    return not np.isnan(rank)
df['in_top_50'] = df['in_top_50'].apply(rank_to_bool)
df['not_in_top_50'] = ~df['in_top_50'] # oppposite of in_top_50

df


# groups data by month and totals occurences of in_top_50 and not_in_top_50
df = df.groupby(by=['month']).sum()

# fills gaps between months
idx = pd.date_range(df.index.min(), df.index.max(), freq='MS')
df = df.reindex(idx, fill_value='0').astype('int32')

df


# plots graph with formatting
ax = df.plot(kind='bar', stacked=True, figsize=(20,10), width=1.0, color=['blueviolet', 'lightsalmon'], title='Solar Flares per Month')

# removes x-tick label for each month except the first of each year
for i, t in enumerate(ax.get_xticklabels()):
    if ((i + 3) % 12) != 0:
        t.set_visible(False)

# makes each x tick label only display the year
_ = ax.set_xticklabels([x.strftime('%Y') for x in df.index], rotation=45) # _ is to suppress output

# sets y axis label
_ = ax.set_ylabel('Number of Solar Flares')

	rank	x_class	date	region	start_time	max_time	end_time	movie
0	1	X28+	2003/11/04	486	19:29	19:53	20:06	MovieView archive
1	2	X20+	2001/04/02	9393	21:32	21:51	22:03	MovieView archive
2	3	X17.2+	2003/10/28	486	09:51	11:10	11:24	MovieView archive
3	4	X17+	2005/09/07	808	17:17	17:40	18:03	MovieView archive
4	5	X14.4	2001/04/15	9415	13:19	13:50	13:55	MovieView archive
5	6	X10	2003/10/29	486	20:37	20:49	21:01	MovieView archive
6	7	X9.4	1997/11/06	8100	11:49	11:55	12:01	MovieView archive
7	8	X9.3	2017/09/06	2673	11:53	12:02	12:10	MovieView archive
8	9	X9	2006/12/05	930	10:18	10:35	10:45	MovieView archive
9	10	X8.3	2003/11/02	486	17:03	17:25	17:39	MovieView archive
10	11	X8.2	2017/09/10	2673	15:35	16:06	16:31	MovieView archive
11	12	X7.1	2005/01/20	720	06:36	07:01	07:26	MovieView archive
12	13	X6.9	2011/08/09	1263	07:48	08:05	08:08	MovieView archive
13	14	X6.5	2006/12/06	930	18:29	18:47	19:00	MovieView archive
14	15	X6.2	2005/09/09	808	19:13	20:04	20:36	MovieView archive
15	16	X6.2	2001/12/13	9733	14:20	14:30	14:35	MovieView archive
16	17	X5.7	2000/07/14	9077	10:03	10:24	10:43	MovieView archive
17	18	X5.6	2001/04/06	9415	19:10	19:21	19:31	MovieView archive
18	19	X5.4	2012/03/07	1429	00:02	00:24	00:40	MovieView archive
19	20	X5.4	2005/09/08	808	20:52	21:06	21:17	MovieView archive
20	21	X5.4	2003/10/23	486	08:19	08:35	08:49	MovieView archive
21	22	X5.3	2001/08/25	9591	16:23	16:45	17:04	MovieView archive
22	23	X4.9	2014/02/25	1990	00:39	00:49	01:03	MovieView archive
23	24	X4.9	1998/08/18	8307	22:10	22:19	22:28	View archive
24	25	X4.8	2002/07/23	39	00:18	00:35	00:47	MovieView archive
25	26	X4	2000/11/26	9236	16:34	16:48	16:56	MovieView archive
26	27	X3.9	2003/11/03	488	09:43	09:55	10:19	MovieView archive
27	28	X3.9	1998/08/19	8307	21:35	21:45	21:50	View archive
28	29	X3.8	2005/01/17	720	06:59	09:52	10:07	MovieView archive
29	30	X3.7	1998/11/22	8384	06:30	06:42	06:49	MovieView archive
30	31	X3.6	2005/09/09	808	09:42	09:59	10:08	MovieView archive
31	32	X3.6	2004/07/16	649	13:49	13:55	14:01	MovieView archive
32	33	X3.6	2003/05/28	365	00:17	00:27	00:39	MovieView archive
33	34	X3.4	2006/12/13	930	02:14	02:40	02:57	MovieView archive
34	35	X3.4	2001/12/28	9767	20:02	20:45	21:32	MovieView archive
35	36	X3.3	2013/11/05	1890	22:07	22:12	22:15	MovieView archive
36	37	X3.3	2002/07/20	39	21:04	21:30	21:54	MovieView archive
37	38	X3.3	1998/11/28	8395	04:54	05:52	06:13	MovieView archive
38	39	X3.2	2013/05/14	1748	00:00	01:11	01:20	MovieView archive
39	40	X3.1	2014/10/24	2192	21:07	21:41	22:13	MovieView archive
40	41	X3.1	2002/08/24	69	00:49	01:12	01:31	MovieView archive
41	42	X3	2002/07/15	30	19:59	20:08	20:14	MovieView archive
42	43	X2.8	2013/05/13	1748	15:48	16:05	16:16	MovieView archive
43	44	X2.8	2001/12/11	9733	07:58	08:08	08:14	MovieView archive
44	45	X2.8	1998/08/18	8307	08:14	08:24	08:32	View archive
45	46	X2.7	2015/05/05	2339	22:05	22:11	22:15	MovieView archive
46	47	X2.7	2003/11/03	488	01:09	01:30	01:45	MovieView archive
47	48	X2.7	1998/05/06	8210	07:58	08:09	08:20	MovieView archive
48	49	X2.6	2005/01/15	720	22:25	23:02	23:31	MovieView archive
49	50	X2.6	2001/09/24	9632	09:32	10:38	11:09	MovieView archive

	rank	x_class	date	region	start_time	max_time	end_time
0	1	X28+	2003/11/04	486	19:29	19:53	20:06
1	2	X20+	2001/04/02	9393	21:32	21:51	22:03
2	3	X17.2+	2003/10/28	486	09:51	11:10	11:24
3	4	X17+	2005/09/07	808	17:17	17:40	18:03
4	5	X14.4	2001/04/15	9415	13:19	13:50	13:55
5	6	X10	2003/10/29	486	20:37	20:49	21:01
6	7	X9.4	1997/11/06	8100	11:49	11:55	12:01
7	8	X9.3	2017/09/06	2673	11:53	12:02	12:10
8	9	X9	2006/12/05	930	10:18	10:35	10:45
9	10	X8.3	2003/11/02	486	17:03	17:25	17:39
10	11	X8.2	2017/09/10	2673	15:35	16:06	16:31
11	12	X7.1	2005/01/20	720	06:36	07:01	07:26
12	13	X6.9	2011/08/09	1263	07:48	08:05	08:08
13	14	X6.5	2006/12/06	930	18:29	18:47	19:00
14	15	X6.2	2005/09/09	808	19:13	20:04	20:36
15	16	X6.2	2001/12/13	9733	14:20	14:30	14:35
16	17	X5.7	2000/07/14	9077	10:03	10:24	10:43
17	18	X5.6	2001/04/06	9415	19:10	19:21	19:31
18	19	X5.4	2012/03/07	1429	00:02	00:24	00:40
19	20	X5.4	2005/09/08	808	20:52	21:06	21:17
20	21	X5.4	2003/10/23	486	08:19	08:35	08:49
21	22	X5.3	2001/08/25	9591	16:23	16:45	17:04
22	23	X4.9	2014/02/25	1990	00:39	00:49	01:03
23	24	X4.9	1998/08/18	8307	22:10	22:19	22:28
24	25	X4.8	2002/07/23	39	00:18	00:35	00:47
25	26	X4	2000/11/26	9236	16:34	16:48	16:56
26	27	X3.9	2003/11/03	488	09:43	09:55	10:19
27	28	X3.9	1998/08/19	8307	21:35	21:45	21:50
28	29	X3.8	2005/01/17	720	06:59	09:52	10:07
29	30	X3.7	1998/11/22	8384	06:30	06:42	06:49
30	31	X3.6	2005/09/09	808	09:42	09:59	10:08
31	32	X3.6	2004/07/16	649	13:49	13:55	14:01
32	33	X3.6	2003/05/28	365	00:17	00:27	00:39
33	34	X3.4	2006/12/13	930	02:14	02:40	02:57
34	35	X3.4	2001/12/28	9767	20:02	20:45	21:32
35	36	X3.3	2013/11/05	1890	22:07	22:12	22:15
36	37	X3.3	2002/07/20	39	21:04	21:30	21:54
37	38	X3.3	1998/11/28	8395	04:54	05:52	06:13
38	39	X3.2	2013/05/14	1748	00:00	01:11	01:20
39	40	X3.1	2014/10/24	2192	21:07	21:41	22:13
40	41	X3.1	2002/08/24	69	00:49	01:12	01:31
41	42	X3	2002/07/15	30	19:59	20:08	20:14
42	43	X2.8	2013/05/13	1748	15:48	16:05	16:16
43	44	X2.8	2001/12/11	9733	07:58	08:08	08:14
44	45	X2.8	1998/08/18	8307	08:14	08:24	08:32
45	46	X2.7	2015/05/05	2339	22:05	22:11	22:15
46	47	X2.7	2003/11/03	488	01:09	01:30	01:45
47	48	X2.7	1998/05/06	8210	07:58	08:09	08:20
48	49	X2.6	2005/01/15	720	22:25	23:02	23:31
49	50	X2.6	2001/09/24	9632	09:32	10:38	11:09

	rank	x_class	region	start_datetime	max_datetime	end_datetime
0	1	X28+	486	2003-11-04 19:29:00	2003-11-04 19:53:00	2003-11-04 20:06:00
1	2	X20+	9393	2001-04-02 21:32:00	2001-04-02 21:51:00	2001-04-02 22:03:00
2	3	X17.2+	486	2003-10-28 09:51:00	2003-10-28 11:10:00	2003-10-28 11:24:00
3	4	X17+	808	2005-09-07 17:17:00	2005-09-07 17:40:00	2005-09-07 18:03:00
4	5	X14.4	9415	2001-04-15 13:19:00	2001-04-15 13:50:00	2001-04-15 13:55:00
5	6	X10	486	2003-10-29 20:37:00	2003-10-29 20:49:00	2003-10-29 21:01:00
6	7	X9.4	8100	1997-11-06 11:49:00	1997-11-06 11:55:00	1997-11-06 12:01:00
7	8	X9.3	2673	2017-09-06 11:53:00	2017-09-06 12:02:00	2017-09-06 12:10:00
8	9	X9	930	2006-12-05 10:18:00	2006-12-05 10:35:00	2006-12-05 10:45:00
9	10	X8.3	486	2003-11-02 17:03:00	2003-11-02 17:25:00	2003-11-02 17:39:00
10	11	X8.2	2673	2017-09-10 15:35:00	2017-09-10 16:06:00	2017-09-10 16:31:00
11	12	X7.1	720	2005-01-20 06:36:00	2005-01-20 07:01:00	2005-01-20 07:26:00
12	13	X6.9	1263	2011-08-09 07:48:00	2011-08-09 08:05:00	2011-08-09 08:08:00
13	14	X6.5	930	2006-12-06 18:29:00	2006-12-06 18:47:00	2006-12-06 19:00:00
14	15	X6.2	808	2005-09-09 19:13:00	2005-09-09 20:04:00	2005-09-09 20:36:00
15	16	X6.2	9733	2001-12-13 14:20:00	2001-12-13 14:30:00	2001-12-13 14:35:00
16	17	X5.7	9077	2000-07-14 10:03:00	2000-07-14 10:24:00	2000-07-14 10:43:00
17	18	X5.6	9415	2001-04-06 19:10:00	2001-04-06 19:21:00	2001-04-06 19:31:00
18	19	X5.4	1429	2012-03-07 00:02:00	2012-03-07 00:24:00	2012-03-07 00:40:00
19	20	X5.4	808	2005-09-08 20:52:00	2005-09-08 21:06:00	2005-09-08 21:17:00
20	21	X5.4	486	2003-10-23 08:19:00	2003-10-23 08:35:00	2003-10-23 08:49:00
21	22	X5.3	9591	2001-08-25 16:23:00	2001-08-25 16:45:00	2001-08-25 17:04:00
22	23	X4.9	1990	2014-02-25 00:39:00	2014-02-25 00:49:00	2014-02-25 01:03:00
23	24	X4.9	8307	1998-08-18 22:10:00	1998-08-18 22:19:00	1998-08-18 22:28:00
24	25	X4.8	39	2002-07-23 00:18:00	2002-07-23 00:35:00	2002-07-23 00:47:00
25	26	X4	9236	2000-11-26 16:34:00	2000-11-26 16:48:00	2000-11-26 16:56:00
26	27	X3.9	488	2003-11-03 09:43:00	2003-11-03 09:55:00	2003-11-03 10:19:00
27	28	X3.9	8307	1998-08-19 21:35:00	1998-08-19 21:45:00	1998-08-19 21:50:00
28	29	X3.8	720	2005-01-17 06:59:00	2005-01-17 09:52:00	2005-01-17 10:07:00
29	30	X3.7	8384	1998-11-22 06:30:00	1998-11-22 06:42:00	1998-11-22 06:49:00
30	31	X3.6	808	2005-09-09 09:42:00	2005-09-09 09:59:00	2005-09-09 10:08:00
31	32	X3.6	649	2004-07-16 13:49:00	2004-07-16 13:55:00	2004-07-16 14:01:00
32	33	X3.6	365	2003-05-28 00:17:00	2003-05-28 00:27:00	2003-05-28 00:39:00
33	34	X3.4	930	2006-12-13 02:14:00	2006-12-13 02:40:00	2006-12-13 02:57:00
34	35	X3.4	9767	2001-12-28 20:02:00	2001-12-28 20:45:00	2001-12-28 21:32:00
35	36	X3.3	1890	2013-11-05 22:07:00	2013-11-05 22:12:00	2013-11-05 22:15:00
36	37	X3.3	39	2002-07-20 21:04:00	2002-07-20 21:30:00	2002-07-20 21:54:00
37	38	X3.3	8395	1998-11-28 04:54:00	1998-11-28 05:52:00	1998-11-28 06:13:00
38	39	X3.2	1748	2013-05-14 00:00:00	2013-05-14 01:11:00	2013-05-14 01:20:00
39	40	X3.1	2192	2014-10-24 21:07:00	2014-10-24 21:41:00	2014-10-24 22:13:00
40	41	X3.1	69	2002-08-24 00:49:00	2002-08-24 01:12:00	2002-08-24 01:31:00
41	42	X3	30	2002-07-15 19:59:00	2002-07-15 20:08:00	2002-07-15 20:14:00
42	43	X2.8	1748	2013-05-13 15:48:00	2013-05-13 16:05:00	2013-05-13 16:16:00
43	44	X2.8	9733	2001-12-11 07:58:00	2001-12-11 08:08:00	2001-12-11 08:14:00
44	45	X2.8	8307	1998-08-18 08:14:00	1998-08-18 08:24:00	1998-08-18 08:32:00
45	46	X2.7	2339	2015-05-05 22:05:00	2015-05-05 22:11:00	2015-05-05 22:15:00
46	47	X2.7	488	2003-11-03 01:09:00	2003-11-03 01:30:00	2003-11-03 01:45:00
47	48	X2.7	8210	1998-05-06 07:58:00	1998-05-06 08:09:00	1998-05-06 08:20:00
48	49	X2.6	720	2005-01-15 22:25:00	2005-01-15 23:02:00	2005-01-15 23:31:00
49	50	X2.6	9632	2001-09-24 09:32:00	2001-09-24 10:38:00	2001-09-24 11:09:00

	string
0	1997/04/01 14:00 04/01 14:15 8000 4000 S25...
1	1997/04/07 14:30 04/07 17:30 11000 1000 S28...
2	1997/05/12 05:15 05/14 16:00 12000 80 N21...
3	1997/05/21 20:20 05/21 22:00 5000 500 N05...
4	1997/09/23 21:53 09/23 22:16 6000 2000 S29...
...	...
515	2017/09/10 16:02 09/11 06:50 16000 150 S09...
516	2017/09/12 07:38 09/12 07:43 16000 13000 N08...
517	2017/09/17 11:45 09/17 12:35 16000 900 S08E...
518	2017/10/18 05:48 10/18 12:40 16000 400 S06E...
519	2019/05/03 23:52 05/04 00:16 13000 2300 N12...

	start_date	start_time	end_date	end_time	start_frequency	end_frequency	flare_location	flare_region	importance	cme_date	cme_time	cpa	width	speed
0	1997/04/01	14:00	04/01	14:15	8000	4000	S25E16	8026	M1.3	04/01	15:18	74	79	312
1	1997/04/07	14:30	04/07	17:30	11000	1000	S28E19	8027	C6.8	04/07	14:27	Halo	360	878
2	1997/05/12	05:15	05/14	16:00	12000	80	N21W08	8038	C1.3	05/12	05:30	Halo	360	464
3	1997/05/21	20:20	05/21	22:00	5000	500	N05W12	8040	M1.3	05/21	21:00	263	165	296
4	1997/09/23	21:53	09/23	22:16	6000	2000	S29E25	8088	C1.4	09/23	22:02	133	155	712
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
515	2017/09/10	16:02	09/11	06:50	16000	150	S09W92	-----	X8.3	09/10	16:00	Halo	360	3163
516	2017/09/12	07:38	09/12	07:43	16000	13000	N08E48	12680	C3.0	09/12	08:03	124	96	252
517	2017/09/17	11:45	09/17	12:35	16000	900	S08E170	-----	----	09/17	12:00	Halo	360	1385
518	2017/10/18	05:48	10/18	12:40	16000	400	S06E123	-----	----	10/18	08:00	85	146	1001
519	2019/05/03	23:52	05/04	00:16	13000	2300	N12E82	12740	C1.0	05/03	23:24	90	113	692

Part 1: Data scraping and preparation¶

Step 1: Scraping competitor's data¶

Step 2: Tidying the top 50 solar flare data¶

Step 3: Scraping the NASA data¶

Step 4: Tidying the NASA table¶

Part 2: Analysis¶

Question 1: Replication¶

Question 2: Integration¶

Question 3: Analysis¶

	start_datetime	end_datetime	start_frequency	end_frequency	flare_location	flare_region	importance	cme_datetime	cpa	width	speed	is_halo	width_lower_bound
0	1997-04-01 14:00:00	1997-04-01 14:15:00	8000	4000	S25E16	8026	M1.3	1997-04-01 15:18:00	74	79	312	False	False
1	1997-04-07 14:30:00	1997-04-07 17:30:00	11000	1000	S28E19	8027	C6.8	1997-04-07 14:27:00	NaN	360	878	True	False
2	1997-05-12 05:15:00	1997-05-14 16:00:00	12000	80	N21W08	8038	C1.3	1997-05-12 05:30:00	NaN	360	464	True	False
3	1997-05-21 20:20:00	1997-05-21 22:00:00	5000	500	N05W12	8040	M1.3	1997-05-21 21:00:00	263	165	296	False	False
4	1997-09-23 21:53:00	1997-09-23 22:16:00	6000	2000	S29E25	8088	C1.4	1997-09-23 22:02:00	133	155	712	False	False
...	...	...	...	...	...	...	...	...	...	...	...	...	...
515	2017-09-10 16:02:00	2017-09-11 06:50:00	16000	150	S09W92	NaN	X8.3	2017-09-10 16:00:00	NaN	360	3163	True	False
516	2017-09-12 07:38:00	2017-09-12 07:43:00	16000	13000	N08E48	12680	C3.0	2017-09-12 08:03:00	124	96	252	False	False
517	2017-09-17 11:45:00	2017-09-17 12:35:00	16000	900	S08E170	NaN	NaN	2017-09-17 12:00:00	NaN	360	1385	True	False
518	2017-10-18 05:48:00	2017-10-18 12:40:00	16000	400	S06E123	NaN	NaN	2017-10-18 08:00:00	85	146	1001	False	False
519	2019-05-03 23:52:00	2019-05-04 00:16:00	13000	2300	N12E82	12740	C1.0	2019-05-03 23:24:00	90	113	692	False	False

	index	start_datetime	end_datetime	start_frequency	end_frequency	flare_location	flare_region	x_class	cme_datetime	cpa	width	speed	is_halo	width_lower_bound
0	240	2003-11-04 20:00:00	2003-11-05 00:00:00	10000	200	S19W83	10486	28	2003-11-04 19:54:00	NaN	360	2657	True	False
1	117	2001-04-02 22:05:00	2001-04-03 02:30:00	14000	250	N19W72	9393	20	2001-04-02 22:06:00	261	244	2505	False	False
2	233	2003-10-28 11:10:00	2003-10-30 00:00:00	14000	40	S16E08	10486	17	2003-10-28 11:30:00	NaN	360	2459	True	False
3	126	2001-04-15 14:05:00	2001-04-16 13:00:00	14000	40	S20W85	9415	14	2001-04-15 14:06:00	245	167	1199	False	False
4	234	2003-10-29 20:55:00	2003-10-30 00:00:00	11000	500	S15W02	10486	10	2003-10-29 20:54:00	NaN	360	2029	True	False
5	8	1997-11-06 12:20:00	1997-11-07 08:30:00	14000	100	S18W63	8100	9.4	1997-11-06 12:10:00	NaN	360	1556	True	False
6	514	2017-09-06 12:05:00	2017-09-07 08:00:00	16000	70	S08W33	12673	9.3	2017-09-06 12:24:00	NaN	360	1571	True	False
7	328	2006-12-05 10:50:00	2006-12-05 20:00:00	14000	250	S07E68	10930	9	NaN	NaN	NaN	NaN	False	False
8	237	2003-11-02 17:30:00	2003-11-03 01:00:00	12000	250	S14W56	10486	8.3	2003-11-02 17:30:00	NaN	360	2598	True	False
9	515	2017-09-10 16:02:00	2017-09-11 06:50:00	16000	150	S09W92	NaN	8.3	2017-09-10 16:00:00	NaN	360	3163	True	False
10	288	2005-01-20 07:15:00	2005-01-20 16:30:00	14000	25	N14W61	10720	7.1	2005-01-20 06:54:00	NaN	360	882	True	False
11	359	2011-08-09 08:20:00	2011-08-09 08:35:00	16000	4000	N17W69	11263	6.9	2011-08-09 08:12:00	NaN	360	1610	True	False
12	331	2006-12-06 19:00:00	2006-12-09 00:00:00	16000	30	S05E64	10930	6.5	NaN	NaN	NaN	NaN	False	False
13	317	2005-09-09 19:45:00	2005-09-09 22:00:00	10000	50	S12E67	10808	6.2	2005-09-09 19:48:00	NaN	360	2257	True	False
14	82	2000-07-14 10:30:00	2000-07-15 14:30:00	14000	80	N22W07	9077	5.7	2000-07-14 10:54:00	NaN	360	1674	True	False
15	121	2001-04-06 19:35:00	2001-04-07 01:50:00	14000	230	S21E31	9415	5.6	2001-04-06 19:30:00	NaN	360	1270	True	False
16	375	2012-03-07 01:00:00	2012-03-08 19:00:00	16000	30	N17E27	11429	5.4	2012-03-07 00:24:00	NaN	360	2684	True	False
17	135	2001-08-25 16:50:00	2001-08-25 23:00:00	8000	170	S17E34	9591	5.3	2001-08-25 16:50:00	NaN	360	1433	True	False
18	443	2014-02-25 00:56:00	2014-02-25 11:28:00	14000	100	S12E82	11990	4.9	2014-02-25 01:25:00	NaN	360	2147	True	False
19	193	2002-07-23 00:50:00	2002-07-23 04:00:00	11000	400	S13E72	10039	4.8	2002-07-23 00:42:00	NaN	360	2285	True	False
20	104	2000-11-26 17:00:00	2000-11-26 17:15:00	14000	7000	N18W38	9236	4	2000-11-26 17:06:00	NaN	360	980	True	False
21	239	2003-11-03 10:00:00	2003-11-03 12:30:00	6000	400	N08W77	10488	3.9	2003-11-03 10:06:00	293	103	1420	False	False
22	286	2005-01-17 10:00:00	2005-01-17 10:35:00	6100	1500	N15W25	10720	3.8	2005-01-17 09:54:00	NaN	360	2547	True	False
23	222	2003-05-28 01:00:00	2003-05-29 00:30:00	1000	200	S07W20	10365	3.6	2003-05-28 00:50:00	NaN	360	1366	True	False
24	332	2006-12-13 02:45:00	2006-12-13 10:40:00	12000	150	S06W23	10930	3.4	2006-12-13 02:54:00	NaN	360	1774	True	False
25	160	2001-12-28 20:35:00	2001-12-29 03:00:00	14000	350	S26E90	9756	3.4	2001-12-28 20:30:00	NaN	360	2216	True	False
26	192	2002-07-20 21:30:00	2002-07-20 22:20:00	10000	2000	S13E90	10039	3.3	2002-07-20 22:06:00	NaN	360	1941	True	False
27	404	2013-05-14 01:16:00	2013-05-14 08:20:00	16000	240	N08E77	11748	3.2	2013-05-14 01:25:00	NaN	360	2625	True	False
28	201	2002-08-24 01:45:00	2002-08-24 03:25:00	5000	400	S02W81	10069	3.1	2002-08-24 01:27:00	NaN	360	1913	True	False
29	403	2013-05-13 16:15:00	2013-05-13 19:10:00	16000	300	N11E85	11748	2.8	2013-05-13 16:07:00	NaN	360	1850	True	False
30	487	2015-05-05 22:24:00	2015-05-05 23:14:00	14000	500	N15E79	12339	2.7	2015-05-05 22:24:00	NaN	360	715	True	False
31	19	1998-05-06 08:25:00	1998-05-06 08:35:00	14000	5000	S11W65	8210	2.7	1998-05-06 08:29:00	309	190	1099	False	False
32	238	2003-11-03 01:15:00	2003-11-03 01:25:00	3000	1500	N10W83	10488	2.7	2003-11-03 01:59:00	304	65	827	False	False
33	284	2005-01-15 23:00:00	2005-01-17 00:00:00	3000	40	N15W05	10720	2.6	2005-01-15 23:06:00	NaN	360	2861	True	False
34	142	2001-09-24 10:45:00	2001-09-25 20:00:00	7000	30	S16E23	9632	2.6	2001-09-24 10:30:00	NaN	360	2402	True	False
35	9	1997-11-27 13:30:00	1997-11-27 14:00:00	14000	7000	N17E63	8113	2.6	1997-11-27 13:56:00	98	91	441	False	False
36	276	2004-11-10 02:25:00	2004-11-10 03:40:00	14000	1000	N09W49	10696	2.5	2004-11-10 02:26:00	NaN	360	3387	True	False
37	123	2001-04-10 05:24:00	2001-04-11 00:00:00	14000	100	S23W09	9415	2.3	2001-04-10 05:30:00	NaN	360	2411	True	False
38	99	2000-11-24 15:25:00	2000-11-24 22:00:00	14000	200	N22W07	9236	2.3	2000-11-24 15:30:00	NaN	360	1245	True	False
39	73	2000-06-06 15:20:00	2000-06-08 09:00:00	14000	40	N20E18	9026	2.3	2000-06-06 15:54:00	NaN	360	1119	True	False
40	345	2011-02-15 02:10:00	2011-02-15 07:00:00	16000	400	S20W12	11158	2.2	2011-02-15 02:24:00	NaN	360	669	True	False
41	318	2005-09-10 21:45:00	2005-09-11 01:00:00	14000	200	S13E47	10808	2.1	2005-09-10 21:52:00	NaN	360	1893	True	False
42	361	2011-09-06 22:30:00	2011-09-07 15:40:00	16000	150	N14W18	11283	2.1	2011-09-06 23:05:00	NaN	360	575	True	False
43	420	2013-10-25 15:08:00	2013-10-25 22:32:00	16000	200	S06E69	11882	2.1	2013-10-25 15:12:00	NaN	360	1081	True	False
44	7	1997-11-04 06:00:00	1997-11-05 04:30:00	14000	100	S14W33	8100	2.1	1997-11-04 06:10:00	NaN	360	785	True	False
45	98	2000-11-24 05:10:00	2000-11-24 15:00:00	14000	100	N20W05	9236	2	2000-11-24 05:30:00	NaN	360	1289	True	False
46	125	2001-04-12 10:20:00	2001-04-12 10:40:00	14000	7000	S19W43	9415	2	2001-04-12 10:31:00	NaN	360	1184	True	False
47	274	2004-11-07 16:25:00	2004-11-08 20:00:00	14000	60	N09W17	10696	2	2004-11-07 16:54:00	NaN	360	1759	True	False
48	285	2005-01-17 09:25:00	2005-01-17 16:00:00	14000	30	N15W25	10720	2	2005-01-17 09:30:00	NaN	360	2094	True	False
49	102	2000-11-25 19:00:00	2000-11-25 19:35:00	6000	2000	N20W23	9236	1.9	2000-11-25 19:31:00	NaN	360	671	True	False

	month	in_top_50	not_in_top_50
0	1997-04-01	False	True
1	1997-04-01	False	True
2	1997-05-01	False	True
3	1997-05-01	False	True
4	1997-09-01	False	True
...	...	...	...
515	2017-09-01	True	False
516	2017-09-01	False	True
517	2017-09-01	False	True
518	2017-10-01	False	True
519	2019-05-01	False	True

	in_top_50	not_in_top_50
1997-04-01	0	2
1997-05-01	0	2
1997-06-01	0	0
1997-07-01	0	0
1997-08-01	0	0
...	...	...
2019-01-01	0	0
2019-02-01	0	0
2019-03-01	0	0
2019-04-01	0	0
2019-05-01	0	1