Scraping IMDb episodes using Cheerio.js - only first page of TV episodes is returned
Working on scraping TV episodes from IMDb (Breaking Bad in the example below). The problem is when implementing the for
loop, only the first iteration of j
is returned.
My assumption is the return statement is exiting the loop but I'm unsure how to fix the problem.
const fetch = require('node-fetch');
const cheerio = require('cheerio');
const searchUrl = 'https://www.imdb.com/find?s=tt&ttype=tv&ref_=fn_tv&q=';
const movieUrl = 'https://www.imdb.com/title/';
async function getEpisodes(searchTerm) {
//const imdbID = await getID(searchTerm);
//const numSeasons = await getSeasons(imdbID);
const imdbID = 'tt0903747';
const numSeasons = 5;
const episodes = ;
for (let j = 1; j <= numSeasons; j++) {
return fetch(`${movieUrl}${imdbID}/episodes?season=${j}`)
.then(response => response.text())
.then(body => {
const $ = cheerio.load(body);
$('div[itemProp="episodes"]').each(function (i, element) {
const airdate = $(element).find('.airdate').text().trim();
const episodeTitle = $(element).find('a[itemProp="name"]').text().trim();
const votes = $(element).find('.ipl-rating-star__total-votes').text().trim().match(/(([^)]+))/)[1];
const rating = $(element).find('.ipl-rating-star ').find('.ipl-rating-star__rating').text().trim().slice(0, 3);
episode = {
season: j,
episodeTitle,
airdate,
votes,
rating
};
episodes.push(episode);
});
return episodes; //Only season 1 is returned.
});
}
}
javascript node.js cheerio imdb
add a comment |
Working on scraping TV episodes from IMDb (Breaking Bad in the example below). The problem is when implementing the for
loop, only the first iteration of j
is returned.
My assumption is the return statement is exiting the loop but I'm unsure how to fix the problem.
const fetch = require('node-fetch');
const cheerio = require('cheerio');
const searchUrl = 'https://www.imdb.com/find?s=tt&ttype=tv&ref_=fn_tv&q=';
const movieUrl = 'https://www.imdb.com/title/';
async function getEpisodes(searchTerm) {
//const imdbID = await getID(searchTerm);
//const numSeasons = await getSeasons(imdbID);
const imdbID = 'tt0903747';
const numSeasons = 5;
const episodes = ;
for (let j = 1; j <= numSeasons; j++) {
return fetch(`${movieUrl}${imdbID}/episodes?season=${j}`)
.then(response => response.text())
.then(body => {
const $ = cheerio.load(body);
$('div[itemProp="episodes"]').each(function (i, element) {
const airdate = $(element).find('.airdate').text().trim();
const episodeTitle = $(element).find('a[itemProp="name"]').text().trim();
const votes = $(element).find('.ipl-rating-star__total-votes').text().trim().match(/(([^)]+))/)[1];
const rating = $(element).find('.ipl-rating-star ').find('.ipl-rating-star__rating').text().trim().slice(0, 3);
episode = {
season: j,
episodeTitle,
airdate,
votes,
rating
};
episodes.push(episode);
});
return episodes; //Only season 1 is returned.
});
}
}
javascript node.js cheerio imdb
add a comment |
Working on scraping TV episodes from IMDb (Breaking Bad in the example below). The problem is when implementing the for
loop, only the first iteration of j
is returned.
My assumption is the return statement is exiting the loop but I'm unsure how to fix the problem.
const fetch = require('node-fetch');
const cheerio = require('cheerio');
const searchUrl = 'https://www.imdb.com/find?s=tt&ttype=tv&ref_=fn_tv&q=';
const movieUrl = 'https://www.imdb.com/title/';
async function getEpisodes(searchTerm) {
//const imdbID = await getID(searchTerm);
//const numSeasons = await getSeasons(imdbID);
const imdbID = 'tt0903747';
const numSeasons = 5;
const episodes = ;
for (let j = 1; j <= numSeasons; j++) {
return fetch(`${movieUrl}${imdbID}/episodes?season=${j}`)
.then(response => response.text())
.then(body => {
const $ = cheerio.load(body);
$('div[itemProp="episodes"]').each(function (i, element) {
const airdate = $(element).find('.airdate').text().trim();
const episodeTitle = $(element).find('a[itemProp="name"]').text().trim();
const votes = $(element).find('.ipl-rating-star__total-votes').text().trim().match(/(([^)]+))/)[1];
const rating = $(element).find('.ipl-rating-star ').find('.ipl-rating-star__rating').text().trim().slice(0, 3);
episode = {
season: j,
episodeTitle,
airdate,
votes,
rating
};
episodes.push(episode);
});
return episodes; //Only season 1 is returned.
});
}
}
javascript node.js cheerio imdb
Working on scraping TV episodes from IMDb (Breaking Bad in the example below). The problem is when implementing the for
loop, only the first iteration of j
is returned.
My assumption is the return statement is exiting the loop but I'm unsure how to fix the problem.
const fetch = require('node-fetch');
const cheerio = require('cheerio');
const searchUrl = 'https://www.imdb.com/find?s=tt&ttype=tv&ref_=fn_tv&q=';
const movieUrl = 'https://www.imdb.com/title/';
async function getEpisodes(searchTerm) {
//const imdbID = await getID(searchTerm);
//const numSeasons = await getSeasons(imdbID);
const imdbID = 'tt0903747';
const numSeasons = 5;
const episodes = ;
for (let j = 1; j <= numSeasons; j++) {
return fetch(`${movieUrl}${imdbID}/episodes?season=${j}`)
.then(response => response.text())
.then(body => {
const $ = cheerio.load(body);
$('div[itemProp="episodes"]').each(function (i, element) {
const airdate = $(element).find('.airdate').text().trim();
const episodeTitle = $(element).find('a[itemProp="name"]').text().trim();
const votes = $(element).find('.ipl-rating-star__total-votes').text().trim().match(/(([^)]+))/)[1];
const rating = $(element).find('.ipl-rating-star ').find('.ipl-rating-star__rating').text().trim().slice(0, 3);
episode = {
season: j,
episodeTitle,
airdate,
votes,
rating
};
episodes.push(episode);
});
return episodes; //Only season 1 is returned.
});
}
}
javascript node.js cheerio imdb
javascript node.js cheerio imdb
asked Nov 20 '18 at 22:05
Matthew SnellMatthew Snell
189316
189316
add a comment |
add a comment |
1 Answer
1
active
oldest
votes
Let's rewrite the function using async await style. This way we make sure we fire fetch
numSeasons
times, await all of them, and process them one by one.
async function processResponse(response, season) {
const body = await response.text();
const $ = cheerio.load(body);
let episodes = ;
$('div[itemProp="episodes"]').each(function (i, element) {
const airdate = $(element).find('.airdate').text().trim();
const episodeTitle = $(element).find('a[itemProp="name"]').text().trim();
const votes = $(element).find('.ipl-rating-star__total-votes').text().trim().match(/(([^)]+))/)[1];
const rating = $(element).find('.ipl-rating-star ').find('.ipl-rating-star__rating').text().trim().slice(0, 3);
episode = {
season,
episodeTitle,
airdate,
votes,
rating
};
episodes.push(episode);
});
return episodes;
}
async function getEpisodes(searchTerm) {
//const imdbID = await getID(searchTerm);
//const numSeasons = await getSeasons(imdbID);
const imdbID = 'tt0903747';
const numSeasons = 5;
let promises = ;
for (let j = 1; j <= numSeasons; j++) {
promises.push(fetch(`${movieUrl}${imdbID}/episodes?season=${j}`));
}
const responses = await Promise.all(promises);
return responses.reduce((accumulator, response, index) => {
return accumulator.concat(await processResponse(response, index + 1));
}, );
}
Thanks! I'm getting an empty array when running the code above. ThegetEpisodes
is iteratingnumSeasons
times but theprocessResponse
doesn't seem to be correctly fetching any data (getting 5 empty arrays). Thoughts?
– Matthew Snell
Nov 21 '18 at 1:24
oh I just noticed response.text() returns a Promise too, so we need to makeprocessResponse
async, andawait response.text()
. Check my edited code
– jamesjaya
Nov 21 '18 at 7:51
add a comment |
Your Answer
StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");
StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "1"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);
StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});
function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: true,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: 10,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});
}
});
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53402288%2fscraping-imdb-episodes-using-cheerio-js-only-first-page-of-tv-episodes-is-retu%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
1 Answer
1
active
oldest
votes
1 Answer
1
active
oldest
votes
active
oldest
votes
active
oldest
votes
Let's rewrite the function using async await style. This way we make sure we fire fetch
numSeasons
times, await all of them, and process them one by one.
async function processResponse(response, season) {
const body = await response.text();
const $ = cheerio.load(body);
let episodes = ;
$('div[itemProp="episodes"]').each(function (i, element) {
const airdate = $(element).find('.airdate').text().trim();
const episodeTitle = $(element).find('a[itemProp="name"]').text().trim();
const votes = $(element).find('.ipl-rating-star__total-votes').text().trim().match(/(([^)]+))/)[1];
const rating = $(element).find('.ipl-rating-star ').find('.ipl-rating-star__rating').text().trim().slice(0, 3);
episode = {
season,
episodeTitle,
airdate,
votes,
rating
};
episodes.push(episode);
});
return episodes;
}
async function getEpisodes(searchTerm) {
//const imdbID = await getID(searchTerm);
//const numSeasons = await getSeasons(imdbID);
const imdbID = 'tt0903747';
const numSeasons = 5;
let promises = ;
for (let j = 1; j <= numSeasons; j++) {
promises.push(fetch(`${movieUrl}${imdbID}/episodes?season=${j}`));
}
const responses = await Promise.all(promises);
return responses.reduce((accumulator, response, index) => {
return accumulator.concat(await processResponse(response, index + 1));
}, );
}
Thanks! I'm getting an empty array when running the code above. ThegetEpisodes
is iteratingnumSeasons
times but theprocessResponse
doesn't seem to be correctly fetching any data (getting 5 empty arrays). Thoughts?
– Matthew Snell
Nov 21 '18 at 1:24
oh I just noticed response.text() returns a Promise too, so we need to makeprocessResponse
async, andawait response.text()
. Check my edited code
– jamesjaya
Nov 21 '18 at 7:51
add a comment |
Let's rewrite the function using async await style. This way we make sure we fire fetch
numSeasons
times, await all of them, and process them one by one.
async function processResponse(response, season) {
const body = await response.text();
const $ = cheerio.load(body);
let episodes = ;
$('div[itemProp="episodes"]').each(function (i, element) {
const airdate = $(element).find('.airdate').text().trim();
const episodeTitle = $(element).find('a[itemProp="name"]').text().trim();
const votes = $(element).find('.ipl-rating-star__total-votes').text().trim().match(/(([^)]+))/)[1];
const rating = $(element).find('.ipl-rating-star ').find('.ipl-rating-star__rating').text().trim().slice(0, 3);
episode = {
season,
episodeTitle,
airdate,
votes,
rating
};
episodes.push(episode);
});
return episodes;
}
async function getEpisodes(searchTerm) {
//const imdbID = await getID(searchTerm);
//const numSeasons = await getSeasons(imdbID);
const imdbID = 'tt0903747';
const numSeasons = 5;
let promises = ;
for (let j = 1; j <= numSeasons; j++) {
promises.push(fetch(`${movieUrl}${imdbID}/episodes?season=${j}`));
}
const responses = await Promise.all(promises);
return responses.reduce((accumulator, response, index) => {
return accumulator.concat(await processResponse(response, index + 1));
}, );
}
Thanks! I'm getting an empty array when running the code above. ThegetEpisodes
is iteratingnumSeasons
times but theprocessResponse
doesn't seem to be correctly fetching any data (getting 5 empty arrays). Thoughts?
– Matthew Snell
Nov 21 '18 at 1:24
oh I just noticed response.text() returns a Promise too, so we need to makeprocessResponse
async, andawait response.text()
. Check my edited code
– jamesjaya
Nov 21 '18 at 7:51
add a comment |
Let's rewrite the function using async await style. This way we make sure we fire fetch
numSeasons
times, await all of them, and process them one by one.
async function processResponse(response, season) {
const body = await response.text();
const $ = cheerio.load(body);
let episodes = ;
$('div[itemProp="episodes"]').each(function (i, element) {
const airdate = $(element).find('.airdate').text().trim();
const episodeTitle = $(element).find('a[itemProp="name"]').text().trim();
const votes = $(element).find('.ipl-rating-star__total-votes').text().trim().match(/(([^)]+))/)[1];
const rating = $(element).find('.ipl-rating-star ').find('.ipl-rating-star__rating').text().trim().slice(0, 3);
episode = {
season,
episodeTitle,
airdate,
votes,
rating
};
episodes.push(episode);
});
return episodes;
}
async function getEpisodes(searchTerm) {
//const imdbID = await getID(searchTerm);
//const numSeasons = await getSeasons(imdbID);
const imdbID = 'tt0903747';
const numSeasons = 5;
let promises = ;
for (let j = 1; j <= numSeasons; j++) {
promises.push(fetch(`${movieUrl}${imdbID}/episodes?season=${j}`));
}
const responses = await Promise.all(promises);
return responses.reduce((accumulator, response, index) => {
return accumulator.concat(await processResponse(response, index + 1));
}, );
}
Let's rewrite the function using async await style. This way we make sure we fire fetch
numSeasons
times, await all of them, and process them one by one.
async function processResponse(response, season) {
const body = await response.text();
const $ = cheerio.load(body);
let episodes = ;
$('div[itemProp="episodes"]').each(function (i, element) {
const airdate = $(element).find('.airdate').text().trim();
const episodeTitle = $(element).find('a[itemProp="name"]').text().trim();
const votes = $(element).find('.ipl-rating-star__total-votes').text().trim().match(/(([^)]+))/)[1];
const rating = $(element).find('.ipl-rating-star ').find('.ipl-rating-star__rating').text().trim().slice(0, 3);
episode = {
season,
episodeTitle,
airdate,
votes,
rating
};
episodes.push(episode);
});
return episodes;
}
async function getEpisodes(searchTerm) {
//const imdbID = await getID(searchTerm);
//const numSeasons = await getSeasons(imdbID);
const imdbID = 'tt0903747';
const numSeasons = 5;
let promises = ;
for (let j = 1; j <= numSeasons; j++) {
promises.push(fetch(`${movieUrl}${imdbID}/episodes?season=${j}`));
}
const responses = await Promise.all(promises);
return responses.reduce((accumulator, response, index) => {
return accumulator.concat(await processResponse(response, index + 1));
}, );
}
edited Nov 21 '18 at 7:49
answered Nov 20 '18 at 23:29
jamesjayajamesjaya
657412
657412
Thanks! I'm getting an empty array when running the code above. ThegetEpisodes
is iteratingnumSeasons
times but theprocessResponse
doesn't seem to be correctly fetching any data (getting 5 empty arrays). Thoughts?
– Matthew Snell
Nov 21 '18 at 1:24
oh I just noticed response.text() returns a Promise too, so we need to makeprocessResponse
async, andawait response.text()
. Check my edited code
– jamesjaya
Nov 21 '18 at 7:51
add a comment |
Thanks! I'm getting an empty array when running the code above. ThegetEpisodes
is iteratingnumSeasons
times but theprocessResponse
doesn't seem to be correctly fetching any data (getting 5 empty arrays). Thoughts?
– Matthew Snell
Nov 21 '18 at 1:24
oh I just noticed response.text() returns a Promise too, so we need to makeprocessResponse
async, andawait response.text()
. Check my edited code
– jamesjaya
Nov 21 '18 at 7:51
Thanks! I'm getting an empty array when running the code above. The
getEpisodes
is iterating numSeasons
times but the processResponse
doesn't seem to be correctly fetching any data (getting 5 empty arrays). Thoughts?– Matthew Snell
Nov 21 '18 at 1:24
Thanks! I'm getting an empty array when running the code above. The
getEpisodes
is iterating numSeasons
times but the processResponse
doesn't seem to be correctly fetching any data (getting 5 empty arrays). Thoughts?– Matthew Snell
Nov 21 '18 at 1:24
oh I just noticed response.text() returns a Promise too, so we need to make
processResponse
async, and await response.text()
. Check my edited code– jamesjaya
Nov 21 '18 at 7:51
oh I just noticed response.text() returns a Promise too, so we need to make
processResponse
async, and await response.text()
. Check my edited code– jamesjaya
Nov 21 '18 at 7:51
add a comment |
Thanks for contributing an answer to Stack Overflow!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
To learn more, see our tips on writing great answers.
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53402288%2fscraping-imdb-episodes-using-cheerio-js-only-first-page-of-tv-episodes-is-retu%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown