## Importing and loading everything we will need to use.
# Load requests
import requests
# Load BeautifulSoup
from bs4 import BeautifulSoup
# Load Regular Expression Library
import re
# Load Headers
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.83 Safari/537.36"}
# Load Google Search
from googlesearch import search

# Load MatPlotLib
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
matplotlib.style.use("fivethirtyeight")

# Load Pandas
import pandas as pd
# Load JSON
import json

# Load Random Number Generator
import random
# Load Scipy Stats
import scipy.stats as stats


#Collect Team Data
# Download every page of the ranking API. `teams` ends up with one entry per
# page, each entry being that page's 'results' value.
teams = []
base_url = "https://c03mmwsf5i.execute-api.us-east-2.amazonaws.com/production/api_ranking/teams/?leagueids=4&nonull=true&page="
num_pages = 4
# The API pages are numbered from 1.
for page_number in range(1, num_pages + 1):
    url = base_url + str(page_number)
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, headers=headers, timeout=30)
    # Stop on an HTTP error here rather than on a confusing JSON/KeyError below.
    response.raise_for_status()
    teams.append(response.json()['results'])


#Flatten the per-page lists in `teams` into one list of team records
data = [team for page in teams for team in page]
# Show the first three records as a sanity check.
data[:3]

[{'id': 796,
  'name': 'Northern Arizona',
  'tfrrs_id': 'AZ_college_m_Northern_Arizona',
  'top_5_ability_average': 812,
  'top_runner': 802,
  'fifth_runner': 825,
  'sex': 'M'},
 {'id': 1340,
  'name': 'BYU',
  'tfrrs_id': 'UT_college_m_BYU',
  'top_5_ability_average': 814,
  'top_runner': 794,
  'fifth_runner': 829,
  'sex': 'M'},
 {'id': 325,
  'name': 'Oklahoma State',
  'tfrrs_id': 'OK_college_m_Oklahoma_State',
  'top_5_ability_average': 815,
  'top_runner': 802,
  'fifth_runner': 821,
  'sex': 'M'}]


# Search Google once per team and store the returned URL on the team record.
# At most one result is requested (num=1, stop=1); a team whose search yields
# nothing is left without a "team_url" key.
for team in data:
    query = f"{team['name']} men's cross country roster 2021-22"
    for url in search(query, tld="co.in", num=1, stop=1, pause=1.5):
        #Add first google search result to data
        team["team_url"] = url


#Manually fix incorrect URLs
# Keys are positions in `data`; values replace that team's "team_url".
url_corrections = {
    1: "https://byucougars.com/roster/m-cross-country",  #BYU
    61: "https://gocolumbialions.com/sports/cross-country/roster",  #Columbia
    66: "https://www.liberty.edu/flames/cross-country/roster/?gender=Male",  #Liberty
    95: "https://www.fullertontitans.com/sports/c-xc/2021-22/mwroster",  #Cal St. Fullerton
    103: "https://georgiadogs.com/sports/cross-country/roster",  #Georgia
    112: "https://calbears.com/sports/cross-country/roster",  #California (Berkeley)
    251: "https://herdzone.com/sports/cross-country/roster",  #Marshall
    261: "https://gomocs.com/sports/cross-country/roster/",  #Chattanooga
    298: "https://nsudemons.com/sports/cross-country/roster",  #Northwestern State (Louisiana)
    300: "https://gojagsports.com/sports/cxc/roster",  #Southern (Baton Rouge)
}

for team_index, roster_url in url_corrections.items():
    data[team_index]["team_url"] = roster_url


def height_to_inches(heights):
    """Convert a list of raw height strings to heights in whole inches.

    Every non-digit character is removed from each entry. The first remaining
    digit is read as feet and any further digits as inches, so ``"5'9"`` and
    ``"5-9"`` both become ``69``. An entry with only a feet digit (``"6'"``)
    is treated as having zero inches. Entries that contain no digits at all
    carry no height information and are dropped.

    Args:
        heights: List of height strings. The list is rewritten in place.

    Returns:
        The same list object, now holding one int per parsable entry.
    """
    converted = []
    for raw in heights:
        #Remove any non-digits
        digits = re.sub(r"\D", "", raw)
        if not digits:
            # Nothing to parse (blank or placeholder cell).
            continue
        #First character is height in feet, the rest (if any) is inches
        feet, inches = digits[0], digits[1:]
        converted.append(int(feet) * 12 + (int(inches) if inches else 0))

    # Slice-assign so callers holding a reference to the list see the result.
    heights[:] = converted
    return heights


def sidearm_get_heights(text):
    """Return the heights listed in the men's roster of an HTML page.

    Looks for the ``<ul id="sidearm-m-roster">`` element, collects the text of
    every ``<span class="sidearm-roster-player-height">`` inside it, and passes
    those strings to ``height_to_inches``. If the roster element is absent,
    an empty list is converted instead.
    """
    roster = BeautifulSoup(text, "html.parser").find(
        "ul", {"id": "sidearm-m-roster"})
    if not roster:
        return height_to_inches([])

    height_spans = roster.find_all(
        "span", {"class": "sidearm-roster-player-height"})
    return height_to_inches([span.text for span in height_spans])


def other_get_heights(text):
    """Return heights read from the first HTML table of a roster page.

    The first table on the page is parsed with pandas. The first column found
    among "Ht", "Ht.", "HT." and "Height" (checked in that order) is converted
    with ``height_to_inches`` after dropping missing values.

    Args:
        text: HTML source of the page.

    Returns:
        A list of heights, or an empty list when the page has no table, the
        table cannot be parsed, or no height column is present.
    """
    # Local import so this function does not depend on a file-level import.
    from io import StringIO

    soup = BeautifulSoup(text, "html.parser")
    if not soup.find("table"):
        return []

    try:
        # Wrap the markup in a file-like object: passing literal HTML to
        # read_html is deprecated in recent pandas releases.
        df = pd.read_html(StringIO(text))[0]
    except ValueError:
        # read_html raises ValueError when it finds no parsable table.
        return []

    for column in ("Ht", "Ht.", "HT.", "Height"):
        if column in df.columns:
            return height_to_inches(list(df[column].dropna()))

    return []


# Download each team's roster page and store the parsed heights on the team
# record. "heights" is always set; it stays an empty list when the team has no
# URL, the request fails, or the server does not answer with status 200.
for team in data:
    team["heights"] = []

    # The search step may not have found a URL for every team.
    url = team.get("team_url")
    if not url:
        continue

    try:
        # A timeout keeps one unresponsive site from stalling the whole crawl.
        r = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as err:
        # One unreachable site should not abort the remaining teams.
        print("Skipping", url, "-", err)
        continue

    if r.status_code == 200:
        # Pages mentioning "sidearm" use the sidearm roster markup; everything
        # else is treated as a plain HTML table.
        if "sidearm" in r.text:
            team["heights"] = sidearm_get_heights(r.text)
        else:
            team["heights"] = other_get_heights(r.text)


#Slicing out non-distance runners
# Hard-coded replacement height lists, keyed by the team's position in `data`.
manual_heights = {
    #Furman
    24: [75, 67, 70, 74, 68, 69, 67, 70, 72, 71, 73, 71, 70, 72, 72, 70, 76],
    #Georgetown
    27: [68, 69, 69],
    #Iona
    33: [74, 69, 68, 72, 70, 72, 68, 70, 71, 70, 72, 71, 70, 74, 72, 72, 71, 69, 72, 70, 71, 67, 73, 73, 70, 70, 69],
    #Alabama
    34: [],
    #UMass Lowell
    60: [70, 70, 69, 73, 69, 70, 71, 70, 75, 70, 69, 71, 70, 69, 68, 70, 70, 70, 67, 69, 73, 65, 73, 66, 69, 66, 70, 67, 68, 70, 70, 70, 72, 70, 74, 69, 70, 69, 68, 69, 71, 70],
}
for team_index, team_heights in manual_heights.items():
    data[team_index]["heights"] = team_heights

#Liberty
# This page is fetched again and read from its <p class="playerDetails">
# elements instead of going through the generic parsers.
liberty_page = requests.get(data[66]["team_url"], headers=headers)
liberty_soup = BeautifulSoup(liberty_page.text, "html.parser")
detail_tags = liberty_soup.find_all("p", {"class": "playerDetails"})
data[66]["heights"] = height_to_inches([tag.text for tag in detail_tags])

#Coastal Carolina
data[273]["heights"] = [73, 65]


# Build the team table straight from the list of team records. `data` is left
# as a list (not rebound to a JSON string), so this cell and the earlier cells
# that index into `data` can be re-run safely.
df = pd.DataFrame(data)
df


# Expand each team's height list to one row per height, drop rows containing
# any missing value, then keep only the columns used in the analysis.
analysis_columns = ["name", "top_5_ability_average", "top_runner", "fifth_runner", "heights"]
individuals_df = df.explode("heights").dropna()[analysis_columns]
individuals_df


# Number of height rows for each row's team name, aligned with individuals_df.
rows_per_team = individuals_df.groupby(["name"])["heights"].transform("size")
# Keep only rows whose team has five or more height rows.
atleast5 = individuals_df[rows_per_team >= 5]
atleast5


# Histogram of individual heights with bin edges at every integer from 63 to 77.
height_bins = list(range(63, 78))
individuals_df["heights"].plot.hist(bins=height_bins)

<AxesSubplot:ylabel='Frequency'>


# Summary statistics of the individual heights, cast to float first.
height_values = individuals_df["heights"].astype(float)
height_values.describe()

count    1422.000000
mean       70.456399
std         2.516210
min        63.000000
25%        69.000000
50%        70.000000
75%        72.000000
max        77.000000
Name: heights, dtype: float64


# Allow up to 100 rows to be shown when a frame is displayed.
pd.set_option("display.max_rows", 100)

# Mean height per team, sorted ascending by that mean. The ability columns are
# part of the grouping key so they come back as columns after reset_index().
team_keys = ["name", "top_5_ability_average", "top_runner", "fifth_runner"]
mean_heights = atleast5.groupby(team_keys)["heights"].mean().sort_values()
average_df = mean_heights.to_frame().reset_index()
average_df


# 2x2 grid: three scatter plots of average team height against an ability
# column, plus one against random numbers for visual comparison.
fig, axs = plt.subplots(2, 2, figsize=(15,12))

# (grid position, column plotted on the y axis, y-axis label)
panels = [
    ((0, 0), "top_5_ability_average", 'top 5 ability average'),
    ((0, 1), "top_runner", 'top runner'),
    ((1, 0), "fifth_runner", 'fifth runner'),
]
for position, column, label in panels:
    axs[position].scatter(average_df["heights"], average_df[column])
    axs[position].set(xlabel='average height (inches)', ylabel=label)

# One random integer per team, drawn from the observed range of
# top_5_ability_average. The bounds are computed once rather than on every
# draw.
lowest = min(average_df["top_5_ability_average"])
highest = max(average_df["top_5_ability_average"])
rand_nums = [random.randint(lowest, highest) for _ in range(len(average_df["heights"]))]

axs[1, 1].scatter(average_df["heights"], rand_nums)
axs[1, 1].set(xlabel='average height (inches)', ylabel='random numbers')

[Text(0.5, 0, 'average height (inches)'), Text(0, 0.5, 'random numbers')]


# Pearson correlation of average team height against each ability measure and
# against the random numbers. Each entry pairs the values to correlate with
# the opening text of the line printed for them.
comparisons = [
    (average_df["top_5_ability_average"],
     "The Correlation Coefficient between average height and top 5 ability average is"),
    (average_df["top_runner"],
     "The Correlation Coefficient between average height and the top runner's ability is"),
    (average_df["fifth_runner"],
     "The Correlation Coefficient between average height and the fifth best runner's ability is"),
    (rand_nums,
     "The Correlation Coefficient between average height and random numbers is"),
]

for values, message in comparisons:
    r_value, p_value = stats.pearsonr(average_df["heights"], values)
    print(message, round(r_value, 3), "with a P-value of P =", round(p_value, 3))

The Correlation Coefficient between average height and top 5 ability average is -0.287 with a P-value of P = 0.004
The Correlation Coefficient between average height and the top runner's ability is -0.255 with a P-value of P = 0.012
The Correlation Coefficient between average height and the fifth best runner's ability is -0.251 with a P-value of P = 0.013
The Correlation Coefficient between average height and random numbers is 0.123 with a P-value of P = 0.231

	id	name	tfrrs_id	top_5_ability_average	top_runner	fifth_runner	sex	team_url	heights
0	796	Northern Arizona	AZ_college_m_Northern_Arizona	812	802	825	M	https://nauathletics.com/sports/cross-country/...	[]
1	1340	BYU	UT_college_m_BYU	814	794	829	M	https://byucougars.com/roster/m-cross-country	[71, 70, 73, 72, 69, 71, 70, 66, 70, 69, 77, 7...
2	325	Oklahoma State	OK_college_m_Oklahoma_State	815	802	821	M	https://okstate.com/sports/mxct/roster	[]
3	943	Notre Dame	IN_college_m_Notre_Dame_IN	817	804	827	M	https://und.com/sports/cross/roster/	[]
4	320	Iowa State	IA_college_m_Iowa_State	817	797	825	M	https://cyclones.com/sports/cross-country/roster	[]
...	...	...	...	...	...	...	...	...	...
314	2095	St. Francis (N.Y.)	NY_college_m_St_Francis_NY	1090	1052	1128	M	https://sfcathletics.com/sports/mens-cross-cou...	[]
315	1563	Mississippi Valley	MS_college_m_Mississippi_Valley	1114	975	1211	M	https://mvsusports.com/sports/mens-cross-country	[]
316	1582	Tennessee St.	TN_college_m_Tennessee_St	1124	1031	1263	M	https://tsutigers.com/sports/mens-cross-countr...	[71, 70]
317	2390	Delaware State	DE_college_m_Delaware_State	1257	902	1484	M	https://dsuhornets.com/sports/mens-cross-count...	[]
318	2500	Coppin State	MD_college_m_Coppin_State	1400	1194	1565	M	https://coppinstatesports.com/sports/mens-cros...	[71, 72]

	name	top_5_ability_average	top_runner	fifth_runner	heights
0	Tulane	936	884	966	68.333333
1	St. Mary's (Cal.)	872	844	890	68.642857
2	Houston	913	877	951	68.666667
3	Presbyterian	1038	969	1089	68.666667
4	Houston Baptist	908	873	942	68.777778
5	Rhode Island	900	888	908	68.777778
6	Marshall	921	901	935	68.800000
7	Arkansas-Pine Bluff	969	947	986	68.833333
8	UNCW	964	947	982	69.000000
9	Sam Houston St.	890	858	915	69.153846
10	Idaho State	890	858	917	69.200000
11	Incarnate Word	888	866	900	69.333333
12	St. Peter's	1022	997	1060	69.500000
13	Nevada	869	851	879	69.500000
14	Oral Roberts	1022	901	1188	69.555556
15	Cal Poly	858	842	867	69.645161
16	Norfolk State	922	882	983	69.666667
17	UC Irvine	895	885	917	69.666667
18	UMass Lowell	853	844	860	69.738095
19	USC Upstate	918	900	929	69.818182
20	Florida Atlantic	997	929	1046	69.857143
21	San Diego	902	881	915	69.888889
22	LSU	874	853	897	69.900000
23	Arkansas	821	811	827	69.944444
24	Mercer	898	888	909	70.000000
25	Tennessee Tech	886	875	894	70.000000
26	Manhattan	886	875	897	70.055556
27	Wagner	902	885	922	70.062500
28	UC Davis	876	858	886	70.071429
29	Colgate	957	931	983	70.090909
30	St. Bonaventure	928	912	942	70.115385
31	UL-Lafayette	955	931	980	70.142857
32	Fordham	890	868	902	70.153846
33	CBU	845	834	859	70.200000
34	Campbell	878	798	915	70.222222
35	Stony Brook	857	849	867	70.250000
36	Buffalo	876	866	894	70.250000
37	Albany	921	894	951	70.250000
38	SE Missouri	909	896	920	70.250000
39	Binghamton	869	841	885	70.266667
40	New Orleans	879	865	902	70.272727
41	Southern Utah	834	820	847	70.300000
42	Brown	875	867	887	70.333333
43	UNC-Greensboro	885	872	896	70.333333
44	East Tenn. St.	867	858	876	70.357143
45	Marist	901	871	916	70.368421
46	Monmouth	886	878	894	70.384615
47	Kansas State	864	858	871	70.400000
48	South Alabama	873	830	909	70.416667
49	Winthrop	945	916	977	70.416667
50	Lipscomb	855	843	863	70.434783
51	Georgia	869	831	890	70.473684
52	Utah Valley	846	837	856	70.480000
53	Colorado	819	804	825	70.482759
54	Temple	865	838	874	70.500000
55	Fairfield	950	906	981	70.555556
56	Abilene Christian	898	888	907	70.562500
57	IUPUI	861	853	866	70.647059
58	UCLA	852	835	869	70.650000
59	Rider	895	886	902	70.655172
60	William and Mary	868	859	878	70.666667
61	North Florida	851	835	868	70.666667
62	Air Force	829	824	833	70.680000
63	BYU	814	794	829	70.708333
64	Longwood	969	943	991	70.714286
65	Long Beach St.	880	865	896	70.727273
66	Iona	836	818	852	70.740741
67	Gardner-Webb	1021	969	1087	70.750000
68	UMBC	907	888	921	70.764706
69	Cal St. Northridge	908	882	966	70.800000
70	Dayton	882	866	892	70.800000
71	Purdue Fort Wayne	898	880	911	70.833333
72	High Point	871	845	889	70.866667
73	Quinnipiac	920	902	930	70.875000
74	Columbia	853	840	864	70.875000
75	Liberty	856	839	866	70.875000
76	Navy	859	853	866	70.954545
77	Furman	833	828	842	71.000000
78	Canisius	900	880	930	71.000000
79	Lamar	876	851	902	71.000000
80	Charlotte	844	814	864	71.076923
81	Duke	842	839	845	71.137931
82	Pepperdine	881	866	900	71.181818
83	Clemson	884	876	891	71.181818
84	Rice	872	854	883	71.187500
85	Xavier (Ohio)	894	875	912	71.214286
86	Bellarmine	896	887	905	71.333333
87	Dartmouth	865	848	875	71.333333
88	Kansas	876	858	891	71.388889
89	La Salle	862	852	871	71.400000
90	Citadel	938	891	970	71.600000
91	Hofstra	917	865	985	71.636364
92	Northwestern St.	1015	943	1123	71.666667
93	Washington	825	814	832	71.818182
94	George Washington	883	870	893	71.818182
95	Seattle U.	911	883	926	71.857143
96	Grand Canyon	867	861	873	72.266667

Does Height Matter for Distance Running?

By Samuel Kellum

Table of contents:

1. Introduction

2. Data Extraction, Transform and Load

3. Exploratory Data Analysis and Data Visualization

4. Hypothesis Testing

5. Conclusion and Further Study

	name	top_5_ability_average	top_runner	fifth_runner	heights
1	BYU	814	794	829	71
1	BYU	814	794	829	70
1	BYU	814	794	829	73
1	BYU	814	794	829	72
1	BYU	814	794	829	69
...	...	...	...	...	...
312	Hampton	1046	1003	1072	68
316	Tennessee St.	1124	1031	1263	71
316	Tennessee St.	1124	1031	1263	70
318	Coppin State	1400	1194	1565	71
318	Coppin State	1400	1194	1565	72