import pandas as pd import numpy as np from bokeh.plotting import figure, show, gmap from bokeh.io import output_file, show, output_notebook, export_png, save, reset_output, export_svg from bokeh.palettes import brewer from bokeh.transform import dodge, factor_cmap from bokeh.models import ColumnDataSource, FactorRange, GMapOptions, HoverTool output_notebook() df_2018 = pd.read_csv("raw_data/y2018.csv")
print(df_2018.head(10), "\n") print(f"Electricity generating technologies in this dataset: {df_2018.Technology.unique()}") print(f"Number of power plants in this dataset: {df_2018.Plant_ID.nunique()}")
Nameplate Technology Latitude Longitude State Plant_ID 0 53.9 Hydroelectric 33.458665 -87.356823 AL 2 1 153.1 Natural Gas 31.006900 -88.010300 AL 3 2 153.1 Natural Gas 31.006900 -88.010300 AL 3 3 403.7 Coal 31.006900 -88.010300 AL 3 4 788.8 Coal 31.006900 -88.010300 AL 3 5 170.1 Natural Gas 31.006900 -88.010300 AL 3 6 170.1 Natural Gas 31.006900 -88.010300 AL 3 7 195.2 Natural Gas 31.006900 -88.010300 AL 3 8 170.1 Natural Gas 31.006900 -88.010300 AL 3 9 170.1 Natural Gas 31.006900 -88.010300 AL 3 Electricity generating technologies in this dataset: ['Hydroelectric' 'Natural Gas' 'Coal' 'Other' 'Nuclear' 'Wind' 'Solar'] Number of power plants in this dataset: 9062
Columns include the nameplate capacity, the generating technology, the location, and the Plant_ID.
Nameplate Capacity The nameplate capacity is the "rated" capacity of the power plant, but it's important to note that this is just a theoretical maximum. For example, a solar plant could be rated for 10 MW in sunny, perfect conditions, but in reality, it might only generate a fraction of that because the sun doesn't always shine. There is a name for this: Capacity Factor.
Below are capacity factors taken from Wikipedia for 2018. Natural gas is assumed to mostly be combined cycle (Cogen) and other is assumed by be biomass, geothermal, etc. Solar is assumed to be photovoltaic (PV)
Repeat Columns Notice rows 2-9. Looks like a repeat right (especially 5 and 6)? There are numerous rows that look like duplicates but are actually different modules of one power plant. For example, it could be two separate nuclear reactors at one site or two cogeneration trains at a natural gas plant. For the purposes of this analysis, it will be best to combine these with a sum these by technology to get the sense of the electricity generating capacity at a facility
capacity_factors = {"Hydroelectric": .428, "Wind": .374, "Solar": .261, "Coal": .54, "Natural Gas":.5, "Nuclear": .887, "Other": .5} electricity_classification = {"Hydroelectric": "Renewable", "Wind": "Renewable", "Solar": "Renewable", "Nuclear": "Renewable", "Other": "Renewable", "Coal": "Fossil Fuel", "Natural Gas": "Fossil Fuel"} # Creating a function in case we would like to repeat this data cleansing def cleanse_df(df, capacity_factors=capacity_factors, electricity_classification=electricity_classification): df = df.groupby(["Plant_ID", "Technology", "Latitude", "Longitude", "State"], as_index=False).sum() df["Classification"] = df["Technology"].map(electricity_classification) df["Electricity Output (MW)"] = np.round(df["Nameplate"] * df["Technology"].map(capacity_factors),1) return df df_2018_clean = cleanse_df(df_2018) # If a powerplant has two modes of generation (eg Coal and NG), the Plant_ID will still be repeated
largest_gen_df_2018 = df_2018_clean.sort_values("Electricity Output (MW)", ascending=False) \ .drop(columns=["Longitude", "Latitude"]) with pd.option_context('expand_frame_repr', False): print(largest_gen_df_2018.head(10))
Plant_ID Technology State Nameplate Classification Electricity Output (MW) 2007 6008 Nuclear AZ 4209.6 Renewable 3733.9 25 46 Nuclear AL 3494.0 Renewable 3099.2 2126 6163 Hydroelectric WA 6809.0 Renewable 2914.3 1627 3166 Nuclear PA 2876.4 Renewable 2551.4 2170 6251 Nuclear TX 2708.6 Renewable 2402.5 1647 3265 Nuclear SC 2666.7 Renewable 2365.4 2560 7722 Nuclear TN 2539.8 Renewable 2252.8 2084 6103 Nuclear PA 2532.0 Renewable 2245.9 2021 6023 Nuclear IL 2449.8 Renewable 2173.0 2020 6022 Nuclear IL 2449.8 Renewable 2173.0
It's no surprise that the 9/10 of the largest generators are nuclear plants.
The hydroelectric plant in Washington is the Grand Coulee Dam (image source: Wikipedia)
df_bars = df_2018_clean.copy() drop_cols = ["Plant_ID", "State", "Longitude", "Latitude"] # Convert to gigawatts for c in ["Electricity Output (MW)", "Nameplate"]: df_bars[c] = df_bars[c]/10**3 df_bars2 = df_bars.copy() # For next analysis tech_group = df_bars.drop(columns=drop_cols) \ .groupby("Technology") \ .agg({"Electricity Output (MW)": 'sum', "Nameplate": 'sum'}) \ .sort_values(by="Nameplate", ascending=False) \ .reset_index() source = ColumnDataSource(tech_group) p = figure(plot_height=400, x_range=tech_group["Technology"], title="Electricity Output by Technology", toolbar_location=None, y_axis_label = "Gigawatts") p.vbar(x="Technology", top="Electricity Output (MW)", width=.9, source=source, color="#e6e600") p.y_range.start = 0 p.xgrid.grid_line_color = None # output_file("images/output.html") # save(p) # export_svg(p, filename="output.svg") show(p)
df_bars2 = df_bars2.groupby("Technology") \ .sum() \ .reset_index() \ .sort_values("Nameplate", ascending=True) technologies = list(df_bars2["Technology"]) categories = ["Electricity Output", "Nameplate"] output = list(df_bars2["Electricity Output (MW)"]) nameplate = list(df_bars2["Nameplate"]) x = [(technology, category) for technology in technologies for category in categories] counts = sum(zip(output, nameplate), ()) source = ColumnDataSource(data=dict(x=x, counts=counts)) f = figure(y_range=FactorRange(*x), plot_width=800, x_axis_label = "Gigawatts") f.hbar(y='x', right='counts', width=.9, color="#e6e600", source=source, line_color="white", fill_color=factor_cmap('x', palette=['firebrick', 'olive'], factors=categories, start=1, end=2)) f.ygrid.grid_line_color = None f.x_range.start = 0 # output_file("images/name_and_output.html") # save(f) show(f)
These plots show that fossil fuels are still king, unfortunately. Aside from nuclear, the capacity factors for renewables are extremely limiting. Of all these technologies, combined cycle natural gas has the best turndown ability, so it will likely remain a core part of the energy mix for some time. Wind and solar still have plenty of room to grow and plenty of market share to take away from coal. I don't know enough about the latest and greatest in nuclear, but I imagine there is room for baseload supply to increase with more nuclear power. The safety issues and capital costs are complicated, however.
def plot_technology(df, Technology, color, start_coords=(37, -122)): """Plots the generating sites for a specific technology using Google Maps API start_coords should be in the form of latitude then longitude. Default location is near San Francisco """ df_g = df.copy() df_g = df_g[df_g["Technology"] == Technology] \ .drop(columns=["Technology", "Plant_ID", "State", "Classification"]) \ .reset_index(drop=True) source = ColumnDataSource( data=dict(lat=list(df_g["Latitude"]), lon=list(df_g["Longitude"]), Nameplate=list(df_g["Nameplate"]), Output=list(df_g["Electricity Output (MW)"])) ) tooltips = [("Nameplate", "@Nameplate"), ("Output", "@Output")] tools = "pan, wheel_zoom, box_select, reset" hover = HoverTool(tooltips=tooltips, mode='mouse') map_options = GMapOptions(lat=start_coords[0], lng = start_coords[1], map_type="satellite", zoom=5) with open("api.txt", "r") as api_file: api_key = api_file.read() g = gmap(api_key, map_options, title=f"{Technology} Generation in US", tools=tools) g.circle(x="lon", y="lat", size=9, line_color="black", fill_color=color, fill_alpha=.5, source=source) g.add_tools(hover) # output_file(f"images/{technology}_map.html") # save(g) reset_output() output_notebook() show(g) colors = {"Solar": "yellow", "Coal": "black", "Natural Gas": "navy", "Hydroelectric": "deepskyblue", "Nuclear": "lime", "Wind": "coral", "Other": "maroon"}
technology = "Solar" coords = (37.77, -122.42) # San Francisco plot_technology(df_2018_clean, technology, colors[technology], start_coords=coords)
technology = "Nuclear" coords = (41.49, -81.69) # Cleveland plot_technology(df_2018_clean, technology, colors[technology], start_coords=coords)
technology = "Wind" coords = (41.25, -95.93) # Omaha plot_technology(df_2018_clean, technology, colors[technology], start_coords=coords)
technology = "Hydroelectric" coords = (45.5, -122.67) # Portland plot_technology(df_2018_clean, technology, colors[technology], start_coords=coords)
technology = "Coal" coords = (40.44, -79.99) # Pittsburgh plot_technology(df_2018_clean, technology, colors[technology], start_coords=coords)
technology = "Natural Gas" coords = (33.74, -84.38) # Atlanta plot_technology(df_2018_clean, technology, colors[technology], start_coords=coords)
technology = "Other" coords = (32.77, -96.8) # Dallas plot_technology(df_2018_clean, technology, colors[technology], start_coords=coords)
# Copy and group data. Convert MW to GW drop_cols = ["Plant_ID", "Latitude", "Longitude", "Technology", "Nameplate"] df_state = df_2018_clean.copy() \ .drop(columns=drop_cols) \ .groupby(by=["State", "Classification"]).sum() \ .reset_index() df_state["Electricity Output"] = df_state["Electricity Output (MW)"]/1000 # Pesky Vermont has no fossil fuel generation (I am skeptical about this, but I am not investigating the data that closely) # Add 0 manually vermont = pd.Series(data={"State": "VT", "Classification": "Fossil Fuel", "Electricity Output": 0}) df_state = df_state.iloc[:92].append(vermont, ignore_index=True) \ .append(df_state.iloc[92:]) \ .drop(columns="Electricity Output (MW)") \ .reset_index(drop=True) # Plots states = list(df_state["State"].unique()) categories = ["Fossil Fuel", "Renewable"] fossil = list((df_state[df_state["Classification"] == "Fossil Fuel"])["Electricity Output"].round(0)) renew = list((df_state[df_state["Classification"] == "Renewable"])["Electricity Output"].round(0)) x = [(state, category) for state in states for category in categories] counts = sum(zip(fossil, renew), ()) source = ColumnDataSource(data=dict(x=x, counts=counts)) f = figure(y_range=FactorRange(*x), plot_width=800, plot_height=2000, x_axis_label = "Gigawatts") f.hbar(y='x', right='counts', width=.9, color="#e6e600", source=source, line_color="white", fill_color=factor_cmap('x', palette=['darkgrey', 'green'], factors=categories, start=1, end=2)) f.ygrid.grid_line_color = None f.x_range.start = 0 show(f)
Group by State Results This plot shows the electricity output is highest in Texas, which is corroborated by this EIA Article. Some smaller states don't show up at this scale, but Bokeh allows you to zoom in. I am surprised how well California, Illinois, and New York do on their renewable to fossil fuel ratios.
States that produce more renewable than fossil fuel power
States that are doing poorly (as judged by me)
years_dfs = {} output_array = np.zeros([18, 7]) # 18 years, 7 technologies years = range(2001,2019) for idx, y in enumerate(years): index = str(y) # Read files and normalize plant names df = pd.read_csv("raw_data/y" + index + ".csv") \ .rename(columns={"PLNTCODE": "Plant_ID", "PLANT_CODE": "Plant_ID"}) # Clean dataframe df_clean = cleanse_df(df) # Convert to Gigawatts for readability df_clean["Electricity Output"] = df_clean["Electricity Output (MW)"]/1000 # Drop unused and groupby technology. Create % Output for stacked chart df_clean = df_clean.drop(columns=["Plant_ID", "Latitude", "Longitude","State", "Electricity Output (MW)", "Nameplate"]) \ .groupby(by="Technology").sum() \ .reset_index() df_clean["% Output"] = 100*df_clean["Electricity Output"] / df_clean["Electricity Output"].sum() # Create output % array and years dictionary to store dfs output_array[idx,:] = np.array(df_clean["% Output"]).reshape(1,-1) years_dfs[index] = df_clean columns = list(df_clean["Technology"]) df_output_years = pd.DataFrame(data=output_array, columns=columns, index=years) p = figure(x_range=(2001,2018), y_range=(0,100), title="USA % of Total Electricity by Technology") p.grid.minor_grid_line_color= '#eeeeee' p.varea_stack(stackers=columns, x='index', color=brewer["Spectral"][7], legend_label=columns, source=df_output_years) p.legend.location = "bottom_left" show(p)
Would be nice to see this data from 2019 - 2021!
Solar is amazingly small but growing. The most obvious trend here is coal getting replaced with natural gas. Hydro and nuclear are fairly constant, which isn't surprising.