Scrape multilevel menu using Scrapy 1.5
I am trying to get all the links from a multilevel menu.
start_urls = ['https://www.bbcgoodfood.com/recipes/category/ingredients']
import scrapy
from foodisgood.items import FoodisgoodItem
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst
class BbcSpider(CrawlSpider):
    """Crawl the BBC Good Food ingredients menu and emit one item per page.

    Starting from ``start_urls``, two rules follow the links found inside
    the menu grid (``_MENU_XPATH``):

    * ``/recipes/category/<slug>`` pages go to ``parse_sub_categories``.
    * ``/recipes/collection/<slug>`` pages go to ``parse_collections``.

    Each callback yields a single ``FoodisgoodItem`` carrying the page
    heading and the page URL.

    NOTE: requests generated by a ``Rule`` carry no data from the page the
    link was found on, so items from ``parse_collections`` have no
    ``category_title``. Attaching the parent category requires yielding the
    requests manually and passing the value through ``Request.meta``.
    """

    name = 'bbc'
    allowed_domains = ['bbcgoodfood.com']
    start_urls = ['https://www.bbcgoodfood.com/recipes/category/ingredients']

    # Links are only extracted from inside the menu grid.
    _MENU_XPATH = '//article[contains(@class, "cleargridindent")]'
    # Heading shared by category and collection pages.
    _TITLE_XPATH = '//h1[@class="section-head--title"]/text()'

    rules = (
        # ``[\w-]+`` matches a whole URL slug; without the backslash the
        # character class only matches the literal characters "w" and "-".
        Rule(
            LinkExtractor(
                allow=r'/recipes/category/[\w-]+$',
                restrict_xpaths=_MENU_XPATH,
            ),
            callback='parse_sub_categories',
            follow=True,
        ),
        Rule(
            LinkExtractor(
                allow=r'/recipes/collection/[\w-]+$',
                restrict_xpaths=_MENU_XPATH,
            ),
            callback='parse_collections',
            follow=True,
        ),
    )

    def _load_page_item(self, response, title_field):
        """Return an item with the page heading in *title_field* and the URL."""
        loader = ItemLoader(item=FoodisgoodItem(), response=response)
        # Store a single value per field instead of the default list.
        loader.default_output_processor = TakeFirst()
        loader.add_xpath(title_field, self._TITLE_XPATH)
        loader.add_value('page_url', response.url)
        return loader.load_item()

    def parse_sub_categories(self, response):
        """Yield an item whose ``category_title`` is the page heading."""
        yield self._load_page_item(response, 'category_title')

    def parse_collections(self, response):
        """Yield an item whose ``collection_title`` is the page heading."""
        yield self._load_page_item(response, 'collection_title')
Results of menu scraping
But I can't understand how to populate the empty first column before the collection title.
For now I have:
EMPTY | Steak recipes | https://www.bbcgoodfood.com/recipes/collection/steak
But I need:
Meat | Steak recipes | https://www.bbcgoodfood.com/recipes/collection/steak
Can somebody advise me on what I need to do to get a result with the subcategory in the first column?
Thanks to everyone)
python scrapy scrapy-spider
add a comment |
I am trying to get all links from multilevel menu.
start_urls = ['https://www.bbcgoodfood.com/recipes/category/ingredients']
import scrapy
from foodisgood.items import FoodisgoodItem
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst
class BbcSpider(CrawlSpider):
    """Crawl the BBC Good Food ingredients menu and emit one item per page.

    Starting from ``start_urls``, two rules follow the links found inside
    the menu grid (``_MENU_XPATH``):

    * ``/recipes/category/<slug>`` pages go to ``parse_sub_categories``.
    * ``/recipes/collection/<slug>`` pages go to ``parse_collections``.

    Each callback yields a single ``FoodisgoodItem`` carrying the page
    heading and the page URL.

    NOTE: requests generated by a ``Rule`` carry no data from the page the
    link was found on, so items from ``parse_collections`` have no
    ``category_title``. Attaching the parent category requires yielding the
    requests manually and passing the value through ``Request.meta``.
    """

    name = 'bbc'
    allowed_domains = ['bbcgoodfood.com']
    start_urls = ['https://www.bbcgoodfood.com/recipes/category/ingredients']

    # Links are only extracted from inside the menu grid.
    _MENU_XPATH = '//article[contains(@class, "cleargridindent")]'
    # Heading shared by category and collection pages.
    _TITLE_XPATH = '//h1[@class="section-head--title"]/text()'

    rules = (
        # ``[\w-]+`` matches a whole URL slug; without the backslash the
        # character class only matches the literal characters "w" and "-".
        Rule(
            LinkExtractor(
                allow=r'/recipes/category/[\w-]+$',
                restrict_xpaths=_MENU_XPATH,
            ),
            callback='parse_sub_categories',
            follow=True,
        ),
        Rule(
            LinkExtractor(
                allow=r'/recipes/collection/[\w-]+$',
                restrict_xpaths=_MENU_XPATH,
            ),
            callback='parse_collections',
            follow=True,
        ),
    )

    def _load_page_item(self, response, title_field):
        """Return an item with the page heading in *title_field* and the URL."""
        loader = ItemLoader(item=FoodisgoodItem(), response=response)
        # Store a single value per field instead of the default list.
        loader.default_output_processor = TakeFirst()
        loader.add_xpath(title_field, self._TITLE_XPATH)
        loader.add_value('page_url', response.url)
        return loader.load_item()

    def parse_sub_categories(self, response):
        """Yield an item whose ``category_title`` is the page heading."""
        yield self._load_page_item(response, 'category_title')

    def parse_collections(self, response):
        """Yield an item whose ``collection_title`` is the page heading."""
        yield self._load_page_item(response, 'collection_title')
Results of menu scraping
But I cant understand how populate empty first column before collection title.
For now I have:
EMPTY | Steak recipes | https://www.bbcgoodfood.com/recipes/collection/steak
But I need:
Meat | Steak recipes | https://www.bbcgoodfood.com/recipes/collection/steak
Can somebody give me advise what need to do to get result with subcategory in first column?
Thanks to everyone)
python scrapy scrapy-spider
add a comment |
I am trying to get all links from multilevel menu.
start_urls = ['https://www.bbcgoodfood.com/recipes/category/ingredients']
import scrapy
from foodisgood.items import FoodisgoodItem
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst
class BbcSpider(CrawlSpider):
    """Crawl the BBC Good Food ingredients menu and emit one item per page.

    Starting from ``start_urls``, two rules follow the links found inside
    the menu grid (``_MENU_XPATH``):

    * ``/recipes/category/<slug>`` pages go to ``parse_sub_categories``.
    * ``/recipes/collection/<slug>`` pages go to ``parse_collections``.

    Each callback yields a single ``FoodisgoodItem`` carrying the page
    heading and the page URL.

    NOTE: requests generated by a ``Rule`` carry no data from the page the
    link was found on, so items from ``parse_collections`` have no
    ``category_title``. Attaching the parent category requires yielding the
    requests manually and passing the value through ``Request.meta``.
    """

    name = 'bbc'
    allowed_domains = ['bbcgoodfood.com']
    start_urls = ['https://www.bbcgoodfood.com/recipes/category/ingredients']

    # Links are only extracted from inside the menu grid.
    _MENU_XPATH = '//article[contains(@class, "cleargridindent")]'
    # Heading shared by category and collection pages.
    _TITLE_XPATH = '//h1[@class="section-head--title"]/text()'

    rules = (
        # ``[\w-]+`` matches a whole URL slug; without the backslash the
        # character class only matches the literal characters "w" and "-".
        Rule(
            LinkExtractor(
                allow=r'/recipes/category/[\w-]+$',
                restrict_xpaths=_MENU_XPATH,
            ),
            callback='parse_sub_categories',
            follow=True,
        ),
        Rule(
            LinkExtractor(
                allow=r'/recipes/collection/[\w-]+$',
                restrict_xpaths=_MENU_XPATH,
            ),
            callback='parse_collections',
            follow=True,
        ),
    )

    def _load_page_item(self, response, title_field):
        """Return an item with the page heading in *title_field* and the URL."""
        loader = ItemLoader(item=FoodisgoodItem(), response=response)
        # Store a single value per field instead of the default list.
        loader.default_output_processor = TakeFirst()
        loader.add_xpath(title_field, self._TITLE_XPATH)
        loader.add_value('page_url', response.url)
        return loader.load_item()

    def parse_sub_categories(self, response):
        """Yield an item whose ``category_title`` is the page heading."""
        yield self._load_page_item(response, 'category_title')

    def parse_collections(self, response):
        """Yield an item whose ``collection_title`` is the page heading."""
        yield self._load_page_item(response, 'collection_title')
Results of menu scraping
But I cant understand how populate empty first column before collection title.
For now I have:
EMPTY | Steak recipes | https://www.bbcgoodfood.com/recipes/collection/steak
But I need:
Meat | Steak recipes | https://www.bbcgoodfood.com/recipes/collection/steak
Can somebody give me advise what need to do to get result with subcategory in first column?
Thanks to everyone)
python scrapy scrapy-spider
I am trying to get all links from multilevel menu.
start_urls = ['https://www.bbcgoodfood.com/recipes/category/ingredients']
import scrapy
from foodisgood.items import FoodisgoodItem
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst
class BbcSpider(CrawlSpider):
    """Crawl the BBC Good Food ingredients menu and emit one item per page.

    Starting from ``start_urls``, two rules follow the links found inside
    the menu grid (``_MENU_XPATH``):

    * ``/recipes/category/<slug>`` pages go to ``parse_sub_categories``.
    * ``/recipes/collection/<slug>`` pages go to ``parse_collections``.

    Each callback yields a single ``FoodisgoodItem`` carrying the page
    heading and the page URL.

    NOTE: requests generated by a ``Rule`` carry no data from the page the
    link was found on, so items from ``parse_collections`` have no
    ``category_title``. Attaching the parent category requires yielding the
    requests manually and passing the value through ``Request.meta``.
    """

    name = 'bbc'
    allowed_domains = ['bbcgoodfood.com']
    start_urls = ['https://www.bbcgoodfood.com/recipes/category/ingredients']

    # Links are only extracted from inside the menu grid.
    _MENU_XPATH = '//article[contains(@class, "cleargridindent")]'
    # Heading shared by category and collection pages.
    _TITLE_XPATH = '//h1[@class="section-head--title"]/text()'

    rules = (
        # ``[\w-]+`` matches a whole URL slug; without the backslash the
        # character class only matches the literal characters "w" and "-".
        Rule(
            LinkExtractor(
                allow=r'/recipes/category/[\w-]+$',
                restrict_xpaths=_MENU_XPATH,
            ),
            callback='parse_sub_categories',
            follow=True,
        ),
        Rule(
            LinkExtractor(
                allow=r'/recipes/collection/[\w-]+$',
                restrict_xpaths=_MENU_XPATH,
            ),
            callback='parse_collections',
            follow=True,
        ),
    )

    def _load_page_item(self, response, title_field):
        """Return an item with the page heading in *title_field* and the URL."""
        loader = ItemLoader(item=FoodisgoodItem(), response=response)
        # Store a single value per field instead of the default list.
        loader.default_output_processor = TakeFirst()
        loader.add_xpath(title_field, self._TITLE_XPATH)
        loader.add_value('page_url', response.url)
        return loader.load_item()

    def parse_sub_categories(self, response):
        """Yield an item whose ``category_title`` is the page heading."""
        yield self._load_page_item(response, 'category_title')

    def parse_collections(self, response):
        """Yield an item whose ``collection_title`` is the page heading."""
        yield self._load_page_item(response, 'collection_title')
Results of menu scraping
But I cant understand how populate empty first column before collection title.
For now I have:
EMPTY | Steak recipes | https://www.bbcgoodfood.com/recipes/collection/steak
But I need:
Meat | Steak recipes | https://www.bbcgoodfood.com/recipes/collection/steak
Can somebody give me advise what need to do to get result with subcategory in first column?
Thanks to everyone)
python scrapy scrapy-spider
python scrapy scrapy-spider
edited Nov 21 '18 at 16:44
stranac
14.5k31725
14.5k31725
asked Nov 21 '18 at 14:20
rusromrusrom
83
83
add a comment |
add a comment |
1 Answer
1
active
oldest
votes
What you want is not really doable using a CrawlSpider's rules (at least not in a simple way).
The usual way to do this is documented in Passing additional data to callback functions.
You would extract the category in your first callback, and then create a new request passing this information in the meta dict.
Thank you for the help) I use meta in requests, but here I am trying to use Rule. I don't understand where I need to pass meta. Can you give a code example?
– rusrom
Nov 21 '18 at 17:38
Unfortunately, your callback and the rule's request creation are completely separated, so you can't do this using rules (at least without overwriting CrawlSpider's private methods). You will have to find links and generate requests manually (just like shown in the linked docs)
– stranac
Nov 21 '18 at 18:12
add a comment |
Your Answer
// Chain of loader callbacks: ifUsing("editor") -> using("externalEditor") ->
// using("snippets"), ending in StackExchange.snippets.init().
// NOTE(review): the loader semantics and the meaning of the trailing
// "code-snippets" argument are defined by StackExchange's own script, which
// is not shown here — confirm before relying on them.
StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");
// Page-ready bootstrap for the answer editor: builds the tag-renderer
// options, then creates the editor via createEditor() once the
// "externalEditor" module (and "snippets", when enabled) has loaded.
StackExchange.ready(function() {
    var channelOptions = {
        tags: "".split(" "),
        id: "1"
    };
    initTagRenderer("".split(" "), "".split(" "), channelOptions);

    StackExchange.using("externalEditor", function() {
        // Have to fire editor after snippets, if snippets enabled
        if (StackExchange.settings.snippets.snippetsEnabled) {
            StackExchange.using("snippets", function() {
                createEditor();
            });
        }
        else {
            createEditor();
        }
    });

    // Function declaration: hoisted, so the calls above can reference it.
    function createEditor() {
        StackExchange.prepareEditor({
            heartbeatType: 'answer',
            autoActivateHeartbeat: false,
            convertImagesToLinks: true,
            noModals: true,
            showLowRepImageUploadWarning: true,
            reputationToPostImages: 10,
            bindNavPrevention: true,
            postfix: "",
            imageUploader: {
                // The \u003c / \u003e escapes encode "<" and ">"; inner double
                // quotes must stay escaped or the literal ends early.
                brandingHtml: "Powered by \u003ca class=\"icon-imgur-white\" href=\"https://imgur.com/\"\u003e\u003c/a\u003e",
                contentPolicyHtml: "User contributions licensed under \u003ca href=\"https://creativecommons.org/licenses/by-sa/3.0/\"\u003ecc by-sa 3.0 with attribution required\u003c/a\u003e \u003ca href=\"https://stackoverflow.com/legal/content-policy\"\u003e(content policy)\u003c/a\u003e",
                allowUrls: true
            },
            onDemand: true,
            discardSelector: ".discard-answer",
            immediatelyShowMarkdownHelp: true
        });
    }
});
Sign up or log in
// On page ready, pass the '#login-link' selector to
// StackExchange.helpers.onClickDraftSave.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
// On page ready, call StackExchange.openid.initPostLogin with the
// '.new-post-login' selector, the percent-encoded URL of this question's
// #new-answer anchor, and the 'question_page' label.
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53414127%2fscrape-multilevel-menu-using-scrapy-1-5%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
1 Answer
1
active
oldest
votes
1 Answer
1
active
oldest
votes
active
oldest
votes
active
oldest
votes
What you want is not really doable using a CrawlSpider's rules (at least not in a simple way).
The usual way to do this is documented in Passing additional data to callback functions.
You would extract the category in your first callback, and then create a new request passing this information in the meta dict.
Thank you for help) I use meta in requests, but here I try to use Rule. I dont understand where I need to pass meta. Can you give some code example.
– rusrom
Nov 21 '18 at 17:38
Unfortunately, your callback and the rule's request creation are completely separated, so you can't do this using rules (at least without overwriting CrawlSpider's private methods). You will have to find links and generate requests manually (just like shown in the linked docs)
– stranac
Nov 21 '18 at 18:12
add a comment |
What you want is not really doable using a CrawlSpider's rules (at least not in a simple way).
The usual way to do this is documented in Passing additional data to callback functions.
You would extract the category in your first callback, and then create a new request passing this information in the meta dict.
Thank you for help) I use meta in requests, but here I try to use Rule. I dont understand where I need to pass meta. Can you give some code example.
– rusrom
Nov 21 '18 at 17:38
Unfortunately, your callback and the rule's request creation are completely separated, so you can't do this using rules (at least without overwriting CrawlSpider's private methods). You will have to find links and generate requests manually (just like shown in the linked docs)
– stranac
Nov 21 '18 at 18:12
add a comment |
What you want is not really doable using a CrawlSpider's rules (at least not in a simple way).
The usual way to do this is documented in Passing additional data to callback functions.
You would extract the category in your first callback, and then create a new request passing this information in the meta dict.
What you want is not really doable using a CrawlSpider's rules (at least not in a simple way).
The usual way to do this is documented in Passing additional data to callback functions.
You would extract the category in your first callback, and then create a new request passing this information in the meta dict.
answered Nov 21 '18 at 17:01
stranacstranac
14.5k31725
14.5k31725
Thank you for help) I use meta in requests, but here I try to use Rule. I dont understand where I need to pass meta. Can you give some code example.
– rusrom
Nov 21 '18 at 17:38
Unfortunately, your callback and the rule's request creation are completely separated, so you can't do this using rules (at least without overwriting CrawlSpider's private methods). You will have to find links and generate requests manually (just like shown in the linked docs)
– stranac
Nov 21 '18 at 18:12
add a comment |
Thank you for help) I use meta in requests, but here I try to use Rule. I dont understand where I need to pass meta. Can you give some code example.
– rusrom
Nov 21 '18 at 17:38
Unfortunately, your callback and the rule's request creation are completely separated, so you can't do this using rules (at least without overwriting CrawlSpider's private methods). You will have to find links and generate requests manually (just like shown in the linked docs)
– stranac
Nov 21 '18 at 18:12
Thank you for help) I use meta in requests, but here I try to use Rule. I dont understand where I need to pass meta. Can you give some code example.
– rusrom
Nov 21 '18 at 17:38
Thank you for help) I use meta in requests, but here I try to use Rule. I dont understand where I need to pass meta. Can you give some code example.
– rusrom
Nov 21 '18 at 17:38
Unfortunately, your callback and the rule's request creation are completely separated, so you can't do this using rules (at least without overwriting
CrawlSpider's private methods). You will have to find links and generate requests manually (just like shown in the linked docs)– stranac
Nov 21 '18 at 18:12
Unfortunately, your callback and the rule's request creation are completely separated, so you can't do this using rules (at least without overwriting
CrawlSpider's private methods). You will have to find links and generate requests manually (just like shown in the linked docs)– stranac
Nov 21 '18 at 18:12
add a comment |
Thanks for contributing an answer to Stack Overflow!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
To learn more, see our tips on writing great answers.
Sign up or log in
// On page ready, pass the '#login-link' selector to
// StackExchange.helpers.onClickDraftSave.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
// On page ready, call StackExchange.openid.initPostLogin with the
// '.new-post-login' selector, the percent-encoded URL of this question's
// #new-answer anchor, and the 'question_page' label.
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53414127%2fscrape-multilevel-menu-using-scrapy-1-5%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
// On page ready, pass the '#login-link' selector to
// StackExchange.helpers.onClickDraftSave.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
// On page ready, pass the '#login-link' selector to
// StackExchange.helpers.onClickDraftSave.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
// On page ready, pass the '#login-link' selector to
// StackExchange.helpers.onClickDraftSave.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown