Skip to content

Commit 372c27f

Browse files
TupleType and eyurtsev authored
community[minor]: [GoogleApiYoutubeLoader] Replace API used in _get_document_for_channel from search to playlistItem (#24034)
- **Description:** The search endpoint is limited to 500 results; playlistItems is not. Also added an exception class (`ParseError`) to the except clause to catch another common error.
- **Issue:** None
- **Dependencies:** None
- **Twitter handle:** @TupleType

---------

Co-authored-by: asi-cider <88270351+asi-cider@users.noreply.github.com>
Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
1 parent 6a45bf9 commit 372c27f

File tree

2 files changed

+20
-11
lines changed

2 files changed

+20
-11
lines changed

libs/community/langchain_community/document_loaders/youtube.py

Lines changed: 19 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@
77
from pathlib import Path
88
from typing import Any, Dict, Generator, List, Optional, Sequence, Union
99
from urllib.parse import parse_qs, urlparse
10+
from xml.etree.ElementTree import ParseError # OK: trusted-source
1011

1112
from langchain_core.documents import Document
1213
from langchain_core.pydantic_v1 import root_validator
@@ -28,6 +29,8 @@ class GoogleApiClient:
2829
As the google api expects credentials you need to set up a google account and
2930
register your Service. "https://developers.google.com/docs/api/quickstart/python"
3031
32+
*Security Note*: Note that parsing of the transcripts relies on the standard
33+
xml library but the input is viewed as trusted in this case.
3134
3235
3336
Example:
@@ -437,6 +440,14 @@ def _get_channel_id(self, channel_name: str) -> str:
437440
channel_id = response["items"][0]["id"]["channelId"]
438441
return channel_id
439442

443+
def _get_uploads_playlist_id(self, channel_id: str) -> str:
444+
request = self.youtube_client.channels().list(
445+
part="contentDetails",
446+
id=channel_id,
447+
)
448+
response = request.execute()
449+
return response["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"]
450+
440451
def _get_document_for_channel(self, channel: str, **kwargs: Any) -> List[Document]:
441452
try:
442453
from youtube_transcript_api import (
@@ -452,34 +463,32 @@ def _get_document_for_channel(self, channel: str, **kwargs: Any) -> List[Documen
452463
)
453464

454465
channel_id = self._get_channel_id(channel)
455-
request = self.youtube_client.search().list(
466+
uploads_playlist_id = self._get_uploads_playlist_id(channel_id)
467+
request = self.youtube_client.playlistItems().list(
456468
part="id,snippet",
457-
channelId=channel_id,
458-
maxResults=50, # adjust this value to retrieve more or fewer videos
469+
playlistId=uploads_playlist_id,
470+
maxResults=50,
459471
)
460472
video_ids = []
461473
while request is not None:
462474
response = request.execute()
463475

464476
# Add each video ID to the list
465477
for item in response["items"]:
466-
if not item["id"].get("videoId"):
467-
continue
468-
meta_data = {"videoId": item["id"]["videoId"]}
478+
video_id = item["snippet"]["resourceId"]["videoId"]
479+
meta_data = {"videoId": video_id}
469480
if self.add_video_info:
470481
item["snippet"].pop("thumbnails")
471482
meta_data.update(item["snippet"])
472483
try:
473-
page_content = self._get_transcripe_for_video_id(
474-
item["id"]["videoId"]
475-
)
484+
page_content = self._get_transcripe_for_video_id(video_id)
476485
video_ids.append(
477486
Document(
478487
page_content=page_content,
479488
metadata=meta_data,
480489
)
481490
)
482-
except (TranscriptsDisabled, NoTranscriptFound) as e:
491+
except (TranscriptsDisabled, NoTranscriptFound, ParseError) as e:
483492
if self.continue_on_failure:
484493
logger.error(
485494
"Error fetching transscript "

libs/community/scripts/lint_imports.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,7 @@ fi
2929
# is very nuanced and depends on the user's environment.
3030
# https://docs.python.org/3/library/xml.etree.elementtree.html
3131

32-
result=$(git -C "$repository_path" grep -En '^from xml.|^import xml$|^import xml.' | grep -vE "# OK: user-must-opt-in" || true)
32+
result=$(git -C "$repository_path" grep -En '^from xml.|^import xml$|^import xml.' | grep -vE "# OK: user-must-opt-in| # OK: trusted-source" || true)
3333

3434
if [ -n "$result" ]; then
3535
echo "ERROR: The following lines need to be updated:"

0 commit comments

Comments
 (0)