diff --git a/index.html b/index.html index 5b52879..0a6cea6 100644 --- a/index.html +++ b/index.html @@ -67,7 +67,7 @@ 4. How do we get and store the video statistics?
- ## Development environment + ## 1. Development environment ```bash $ mkdir tubestats $ cd tubestats @@ -80,18 +80,20 @@ ```
- ## Video information - - use `beautifulsoup`, `scraPY`, `selenium` + ## 2. Video information + - use `beautifulsoup`, `scraPY`, `selenium`? - YouTube Data API + - `google-api-python-client`
- ## Storing passwords + ## 3. Storing API Keys - Hard code? - `python-dotenv` - `.env` - `.gitignore`
+ ### 3. Storing API keys ``` # .env @@ -108,6 +110,63 @@ ```
+
+ ## 4. Get YouTube video statistics + - Access API + - Channel upload playlist + - Video statistics +
+
+ ### 4. Get YouTube video statistics + ```python + import googleapiclient.discovery + + load_dotenv() + api_service_name = 'youtube' + api_version = 'v3' + + youtube = googleapiclient.discovery.build( + api_service_name, + api_version, + developerKey=os.getenv('API_KEY')) + ``` + +
+
+
# tubestats/youtube_api.py
+
+upload_playlist_ID = channel_data['upload_playlist_ID']
+
+video_response = []
+next_page_token = None
+while True:
+    # request one page of the upload playlist (snippet + contentDetails per item)
+    playlist_request = self.youtube.playlistItems().list(
+	    part='snippet,contentDetails',
+	    maxResults=50, # API limit is 50 results per page
+	    pageToken=next_page_token,
+	    playlistId=upload_playlist_ID,
+	    )
+    playlist_response = playlist_request.execute()
+    # collect the videoId of every item on this page
+    vid_subset = [ vid_ID['contentDetails']['videoId'] 
+    			for vid_ID in playlist_response['items'] ]
+    # fetch snippet, contentDetails and statistics for this page's videos
+    vid_info_subset_request = self.youtube.videos().list(
+	part='snippet,contentDetails,statistics',
+	id=vid_subset
+	)
+    vid_info_subset_response = vid_info_subset_request.execute()
+    video_response.append(vid_info_subset_response)
+    # advance to the next page; the loop ends once no token is returned
+    next_page_token = playlist_response.get('nextPageToken') # .get() because the 'nextPageToken' key may be absent
+    if next_page_token is None:
+	break
+
+df = pd.json_normalize(video_response, 'items')
+return df
+					
+
## How does TubeStats work? ### Part 2 of 2 @@ -115,7 +174,197 @@ 6. How to test the code? 7. How to display the data and allow interaction? 8. How to account for variable input? +
+
+ ## 5. Organising code + ```bash [1-9|10-15|16-20|21] + tubestats/ + ├── data + │   ├── channel_data.pkl + │   └── video_data.pkl + ├── LICENSE + ├── Procfile + ├── README.MD + ├── requirements.txt + ├── setup.sh + ├── tests + │   ├── __init__.py + │   ├── test_settings.py + │   ├── test_youtube_api.py + │   ├── test_youtube_data.py + │   └── test_youtube_parser.py + ├── tubestats + │   ├── __init__.py + │   ├── youtube_api.py + │   ├── youtube_data.py + │   └── youtube_parser.py + └── youtube_presenter.py +
+
+ ## 6. Testing + ```python [|16-20] + # tests/test_youtube_api.py + from tubestats.youtube_api import create_api, YouTubeAPI + from tests.test_settings import set_channel_ID_test_case + from pathlib import Path + + import pytest + import googleapiclient + import pandas + + def test_create_api(): + youtube = create_api() + assert isinstance(youtube, googleapiclient.discovery.Resource) + + @pytest.fixture() + def youtubeapi(): + channel_ID = set_channel_ID_test_case() + yt = YouTubeAPI(channel_ID) + return yt + + def test_get_video_data(youtubeapi): + df = youtubeapi.get_video_data() + assert isinstance(df, pandas.core.frame.DataFrame) + + # saving video data to save API calls for later testing + BASE_DIR = Path(__file__).parent.parent + df.to_pickle(BASE_DIR / 'data' / 'video_data.pkl') +
+
+ ## 7. Sharing to the world + - graphs with tool tips, `altair` + - creating interaction with `streamlit` + - hosting on Heroku +
+
+ ### 7. Sharing to the world + ```python [] + # tubestats/youtube_data.py + + import altair as alt + + def scatter_all_videos(self, df: pd.core.frame.DataFrame) -> alt.vegalite.v4.Chart: + df_views = df + c = alt.Chart(df_views, title='Plot of videos over time').mark_point().encode( + x=alt.X('snippet\.publishedAt_REFORMATED:T', axis=alt.Axis(title='Date Published')), + y=alt.Y('statistics\.viewCount_NLOG:Q', axis=alt.Axis(title='Natural Log of Views')), + color=alt.Color('statistics\.like-dislike-ratio:Q', scale=alt.Scale(scheme='turbo'), legend=None), + tooltip=['snippet\.title:N', 'statistics\.viewCount:Q', 'statistics\.like-dislike-ratio:Q'], + size=alt.Size('statistics\.viewCount:Q', legend=None) + ) + return c +
+
+ ### 7. Sharing to the world + ```python [|5-14|16-19] + # youtube_presenter.py + + import streamlit as st + + def date_slider(date_end=datetime.today()): + date_start, date_end = st.slider( + 'Select date range to include:', + min_value=first_video_date, # first video + max_value=last_video_date, #value for date_end + value=(first_video_date , last_video_date), #same as min value + step=timedelta(days=2), + format='YYYY-MM-DD', + key=999) + return date_start, date_end + + date_start, date_end = date_slider() + transformed_df = youtuber_data.transform_dataframe(date_start=date_start, date_end=date_end) + c = youtuber_data.scatter_all_videos(transformed_df) + st.altair_chart(c, use_container_width=True) +
+
+ ### 7. Sharing to the world + ```shell [] + $ streamlit run youtube_presenter.py + + You can now view your Streamlit app in your browser. + + Local URL: http://localhost:8501 + Network URL: http://192.0.0.1:8501 +
+
+ ![](all-graph.png) +
+
+ ### 7. Sharing to the world + ```bash + $ (venv) pip freeze > requirements.txt + ``` + + ```bash + # setup.sh + + mkdir -p ~/.streamlit/ + + echo "\ + [server]\n\ + headless = true\n\ + port = $PORT\n\ + enableCORS = false\n\ + \n\ + " > ~/.streamlit/config.toml + ``` + + ```bash + # Procfile + + web: sh setup.sh && streamlit run youtube_presenter.py + ``` + + ```bash + $ heroku login + $ heroku create tubestats + + $ git push heroku main + ``` +
+
+ ## 8. Different user input + - taking video ID + - URL links + - using regex, `re` module +
+
+ ### 8. Different user input + ![](regex.png) + ```python [] + import re + + LINK_MATCH = r'(^.*youtu)(\.be|be\.com)(\/watch\?v\=|\/)([a-zA-Z0-9_-]+)(\/)?([a-zA-Z0-9_-]+)?' + m = re.search(LINK_MATCH, for_parse) + video_id = m.group(4) # video ID + if video_id == 'channel': + return m.group(6) # Channel ID + elif video_id == 'user': + channel_username = m.group(6) # Channel Username +
+
+ ## Some things I would like to discuss +
+
+ df = self.df + df = df[['snippet.publishedAt', + 'snippet.title', + ... + 'statistics.favoriteCount', + 'statistics.commentCount']] + + df = df.fillna(0) + + # changing dtypes + df = df.astype({'statistics.viewCount': 'int', + ... + 'statistics.commentCount': 'int',}) + # applying natural log to view count as data is tail heavy + df['statistics.viewCount_NLOG'] = df['statistics.viewCount'].apply(lambda x : np.log(x)) + + df = df.sort_values(by='snippet.publishedAt_REFORMATED', ascending=True) + return df +
diff --git a/regex.png b/regex.png new file mode 100644 index 0000000..3785d4f Binary files /dev/null and b/regex.png differ