Scrape multilevel menu using Scrapy 1.5












1















I am trying to get all links from multilevel menu.

start_urls = ['https://www.bbcgoodfood.com/recipes/category/ingredients']



import scrapy

from foodisgood.items import FoodisgoodItem
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst


class BbcSpider(CrawlSpider):
    """Crawl the BBC Good Food ingredients menu and emit one item per page.

    Starting from the ingredients index, the rules follow sub-category and
    collection links found inside the ``cleargridindent`` article and hand
    each response to the matching callback.

    Each callback only sees its own response, so an item produced by
    ``parse_collections`` carries ``collection_title`` and ``page_url`` but
    no ``category_title``.
    """

    name = 'bbc'
    allowed_domains = ['bbcgoodfood.com']

    start_urls = ['https://www.bbcgoodfood.com/recipes/category/ingredients']

    rules = (
        # ``[\w-]+$`` matches one trailing slug made of word characters and
        # hyphens; without the backslash the class would only match "w"/"-".
        Rule(
            LinkExtractor(
                allow=(r'/recipes/category/[\w-]+$'),
                restrict_xpaths='//article[contains(@class, "cleargridindent")]',
            ),
            callback='parse_sub_categories',
            follow=True,
        ),
        Rule(
            LinkExtractor(
                allow=(r'/recipes/collection/[\w-]+$'),
                restrict_xpaths='//article[contains(@class, "cleargridindent")]',
            ),
            callback='parse_collections',
            follow=True,
        ),
    )

    def parse_sub_categories(self, response):
        """Yield an item with the sub-category heading and the page URL."""
        l = ItemLoader(item=FoodisgoodItem(), response=response)

        # Collapse every loaded field to its first extracted value.
        l.default_output_processor = TakeFirst()

        l.add_xpath('category_title', '//h1[@class="section-head--title"]/text()')
        l.add_value('page_url', response.url)

        yield l.load_item()

    def parse_collections(self, response):
        """Yield an item with the collection heading and the page URL."""
        l = ItemLoader(item=FoodisgoodItem(), response=response)

        # Collapse every loaded field to its first extracted value.
        l.default_output_processor = TakeFirst()

        l.add_xpath('collection_title', '//h1[@class="section-head--title"]/text()')
        l.add_value('page_url', response.url)

        yield l.load_item()


Results of menu scraping
But I can't understand how to populate the empty first column before the collection title.



For now I have:



EMPTY | Steak recipes | https://www.bbcgoodfood.com/recipes/collection/steak



But I need:



Meat | Steak recipes | https://www.bbcgoodfood.com/recipes/collection/steak



Can somebody advise me on what I need to do to get a result with the subcategory in the first column?



Thanks to everyone)










share|improve this question





























    1















    I am trying to get all links from multilevel menu.

    start_urls = ['https://www.bbcgoodfood.com/recipes/category/ingredients']



    import scrapy

    from foodisgood.items import FoodisgoodItem
    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.linkextractors import LinkExtractor
    from scrapy.loader import ItemLoader
    from scrapy.loader.processors import TakeFirst


    class BbcSpider(CrawlSpider):
        """Crawl the BBC Good Food ingredients menu and emit one item per page.

        Starting from the ingredients index, the rules follow sub-category and
        collection links found inside the ``cleargridindent`` article and hand
        each response to the matching callback.

        Each callback only sees its own response, so an item produced by
        ``parse_collections`` carries ``collection_title`` and ``page_url`` but
        no ``category_title``.
        """

        name = 'bbc'
        allowed_domains = ['bbcgoodfood.com']

        start_urls = ['https://www.bbcgoodfood.com/recipes/category/ingredients']

        rules = (
            # ``[\w-]+$`` matches one trailing slug made of word characters and
            # hyphens; without the backslash the class would only match "w"/"-".
            Rule(
                LinkExtractor(
                    allow=(r'/recipes/category/[\w-]+$'),
                    restrict_xpaths='//article[contains(@class, "cleargridindent")]',
                ),
                callback='parse_sub_categories',
                follow=True,
            ),
            Rule(
                LinkExtractor(
                    allow=(r'/recipes/collection/[\w-]+$'),
                    restrict_xpaths='//article[contains(@class, "cleargridindent")]',
                ),
                callback='parse_collections',
                follow=True,
            ),
        )

        def parse_sub_categories(self, response):
            """Yield an item with the sub-category heading and the page URL."""
            l = ItemLoader(item=FoodisgoodItem(), response=response)

            # Collapse every loaded field to its first extracted value.
            l.default_output_processor = TakeFirst()

            l.add_xpath('category_title', '//h1[@class="section-head--title"]/text()')
            l.add_value('page_url', response.url)

            yield l.load_item()

        def parse_collections(self, response):
            """Yield an item with the collection heading and the page URL."""
            l = ItemLoader(item=FoodisgoodItem(), response=response)

            # Collapse every loaded field to its first extracted value.
            l.default_output_processor = TakeFirst()

            l.add_xpath('collection_title', '//h1[@class="section-head--title"]/text()')
            l.add_value('page_url', response.url)

            yield l.load_item()


    Results of menu scraping
    But I can't understand how to populate the empty first column before the collection title.



    For now I have:



    EMPTY | Steak recipes | https://www.bbcgoodfood.com/recipes/collection/steak



    But I need:



    Meat | Steak recipes | https://www.bbcgoodfood.com/recipes/collection/steak



    Can somebody advise me on what I need to do to get a result with the subcategory in the first column?



    Thanks to everyone)










    share|improve this question



























      1












      1








      1








      I am trying to get all links from multilevel menu.

      start_urls = ['https://www.bbcgoodfood.com/recipes/category/ingredients']



      import scrapy

      from foodisgood.items import FoodisgoodItem
      from scrapy.spiders import CrawlSpider, Rule
      from scrapy.linkextractors import LinkExtractor
      from scrapy.loader import ItemLoader
      from scrapy.loader.processors import TakeFirst


      class BbcSpider(CrawlSpider):
          """Crawl the BBC Good Food ingredients menu and emit one item per page.

          Starting from the ingredients index, the rules follow sub-category and
          collection links found inside the ``cleargridindent`` article and hand
          each response to the matching callback.

          Each callback only sees its own response, so an item produced by
          ``parse_collections`` carries ``collection_title`` and ``page_url`` but
          no ``category_title``.
          """

          name = 'bbc'
          allowed_domains = ['bbcgoodfood.com']

          start_urls = ['https://www.bbcgoodfood.com/recipes/category/ingredients']

          rules = (
              # ``[\w-]+$`` matches one trailing slug made of word characters and
              # hyphens; without the backslash the class would only match "w"/"-".
              Rule(
                  LinkExtractor(
                      allow=(r'/recipes/category/[\w-]+$'),
                      restrict_xpaths='//article[contains(@class, "cleargridindent")]',
                  ),
                  callback='parse_sub_categories',
                  follow=True,
              ),
              Rule(
                  LinkExtractor(
                      allow=(r'/recipes/collection/[\w-]+$'),
                      restrict_xpaths='//article[contains(@class, "cleargridindent")]',
                  ),
                  callback='parse_collections',
                  follow=True,
              ),
          )

          def parse_sub_categories(self, response):
              """Yield an item with the sub-category heading and the page URL."""
              l = ItemLoader(item=FoodisgoodItem(), response=response)

              # Collapse every loaded field to its first extracted value.
              l.default_output_processor = TakeFirst()

              l.add_xpath('category_title', '//h1[@class="section-head--title"]/text()')
              l.add_value('page_url', response.url)

              yield l.load_item()

          def parse_collections(self, response):
              """Yield an item with the collection heading and the page URL."""
              l = ItemLoader(item=FoodisgoodItem(), response=response)

              # Collapse every loaded field to its first extracted value.
              l.default_output_processor = TakeFirst()

              l.add_xpath('collection_title', '//h1[@class="section-head--title"]/text()')
              l.add_value('page_url', response.url)

              yield l.load_item()


      Results of menu scraping
      But I can't understand how to populate the empty first column before the collection title.



      For now I have:



      EMPTY | Steak recipes | https://www.bbcgoodfood.com/recipes/collection/steak



      But I need:



      Meat | Steak recipes | https://www.bbcgoodfood.com/recipes/collection/steak



      Can somebody advise me on what I need to do to get a result with the subcategory in the first column?



      Thanks to everyone)










      share|improve this question
















      I am trying to get all links from multilevel menu.

      start_urls = ['https://www.bbcgoodfood.com/recipes/category/ingredients']



      import scrapy

      from foodisgood.items import FoodisgoodItem
      from scrapy.spiders import CrawlSpider, Rule
      from scrapy.linkextractors import LinkExtractor
      from scrapy.loader import ItemLoader
      from scrapy.loader.processors import TakeFirst


      class BbcSpider(CrawlSpider):
          """Crawl the BBC Good Food ingredients menu and emit one item per page.

          Starting from the ingredients index, the rules follow sub-category and
          collection links found inside the ``cleargridindent`` article and hand
          each response to the matching callback.

          Each callback only sees its own response, so an item produced by
          ``parse_collections`` carries ``collection_title`` and ``page_url`` but
          no ``category_title``.
          """

          name = 'bbc'
          allowed_domains = ['bbcgoodfood.com']

          start_urls = ['https://www.bbcgoodfood.com/recipes/category/ingredients']

          rules = (
              # ``[\w-]+$`` matches one trailing slug made of word characters and
              # hyphens; without the backslash the class would only match "w"/"-".
              Rule(
                  LinkExtractor(
                      allow=(r'/recipes/category/[\w-]+$'),
                      restrict_xpaths='//article[contains(@class, "cleargridindent")]',
                  ),
                  callback='parse_sub_categories',
                  follow=True,
              ),
              Rule(
                  LinkExtractor(
                      allow=(r'/recipes/collection/[\w-]+$'),
                      restrict_xpaths='//article[contains(@class, "cleargridindent")]',
                  ),
                  callback='parse_collections',
                  follow=True,
              ),
          )

          def parse_sub_categories(self, response):
              """Yield an item with the sub-category heading and the page URL."""
              l = ItemLoader(item=FoodisgoodItem(), response=response)

              # Collapse every loaded field to its first extracted value.
              l.default_output_processor = TakeFirst()

              l.add_xpath('category_title', '//h1[@class="section-head--title"]/text()')
              l.add_value('page_url', response.url)

              yield l.load_item()

          def parse_collections(self, response):
              """Yield an item with the collection heading and the page URL."""
              l = ItemLoader(item=FoodisgoodItem(), response=response)

              # Collapse every loaded field to its first extracted value.
              l.default_output_processor = TakeFirst()

              l.add_xpath('collection_title', '//h1[@class="section-head--title"]/text()')
              l.add_value('page_url', response.url)

              yield l.load_item()


      Results of menu scraping
      But I can't understand how to populate the empty first column before the collection title.



      For now I have:



      EMPTY | Steak recipes | https://www.bbcgoodfood.com/recipes/collection/steak



      But I need:



      Meat | Steak recipes | https://www.bbcgoodfood.com/recipes/collection/steak



      Can somebody advise me on what I need to do to get a result with the subcategory in the first column?



      Thanks to everyone)







      python scrapy scrapy-spider






      share|improve this question















      share|improve this question













      share|improve this question




      share|improve this question








      edited Nov 21 '18 at 16:44









      stranac

      14.5k31725




      14.5k31725










      asked Nov 21 '18 at 14:20









      rusromrusrom

      83




      83
























          1 Answer
          1






          active

          oldest

          votes


















          0














          What you want is not really doable using a CrawlSpider's rules (at least not in a simple way).



          The usual way to do this is documented in Passing additional data to callback functions.

          You would extract the category in your first callback, and then create a new request passing this information in the meta dict.






          share|improve this answer
























          • Thank you for the help. I use meta in requests, but here I am trying to use a Rule, and I don't understand where I need to pass meta. Can you give a code example?

            – rusrom
            Nov 21 '18 at 17:38











          • Unfortunately, your callback and the rule's request creation are completely separated, so you can't do this using rules (at least without overwriting CrawlSpider's private methods). You will have to find links and generate requests manually (just like shown in the linked docs)

            – stranac
            Nov 21 '18 at 18:12











          Your Answer






          // Nested loader callbacks: within ifUsing("editor", ..., "code-snippets"),
          // request "externalEditor", then "snippets", and finally call
          // StackExchange.snippets.init() from the innermost callback.
          StackExchange.ifUsing("editor", function () {
          StackExchange.using("externalEditor", function () {
          StackExchange.using("snippets", function () {
          StackExchange.snippets.init();
          });
          });
          }, "code-snippets");

          // On page ready: initialise the tag renderer, then create the answer
          // editor through the "externalEditor" module.
          StackExchange.ready(function() {
              var channelOptions = {
                  tags: "".split(" "),
                  id: "1"
              };
              initTagRenderer("".split(" "), "".split(" "), channelOptions);

              StackExchange.using("externalEditor", function() {
                  // Have to fire editor after snippets, if snippets enabled
                  if (StackExchange.settings.snippets.snippetsEnabled) {
                      StackExchange.using("snippets", function() {
                          createEditor();
                      });
                  }
                  else {
                      createEditor();
                  }
              });

              // Function declaration: hoisted, so the callbacks above may call it.
              function createEditor() {
                  StackExchange.prepareEditor({
                      heartbeatType: 'answer',
                      autoActivateHeartbeat: false,
                      convertImagesToLinks: true,
                      noModals: true,
                      showLowRepImageUploadWarning: true,
                      reputationToPostImages: 10,
                      bindNavPrevention: true,
                      postfix: "",
                      imageUploader: {
                          // \u003c and \u003e are "<" and ">"; quotes inside the
                          // HTML must be escaped to keep the string literal valid.
                          brandingHtml: "Powered by \u003ca class=\"icon-imgur-white\" href=\"https://imgur.com/\"\u003e\u003c/a\u003e",
                          contentPolicyHtml: "User contributions licensed under \u003ca href=\"https://creativecommons.org/licenses/by-sa/3.0/\"\u003ecc by-sa 3.0 with attribution required\u003c/a\u003e \u003ca href=\"https://stackoverflow.com/legal/content-policy\"\u003e(content policy)\u003c/a\u003e",
                          allowUrls: true
                      },
                      onDemand: true,
                      discardSelector: ".discard-answer",
                      immediatelyShowMarkdownHelp: true
                  });
              }
          });














          draft saved

          draft discarded


















          // On page ready, call StackExchange.openid.initPostLogin for the
          // '.new-post-login' selector. The second argument is a percent-encoded
          // URL pointing at this question's #new-answer anchor (presumably the
          // post-login return address; confirm against the StackExchange API).
          StackExchange.ready(
          function () {
          StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53414127%2fscrape-multilevel-menu-using-scrapy-1-5%23new-answer', 'question_page');
          }
          );

          Post as a guest















          Required, but never shown

























          1 Answer
          1






          active

          oldest

          votes








          1 Answer
          1






          active

          oldest

          votes









          active

          oldest

          votes






          active

          oldest

          votes









          0














          What you want is not really doable using a CrawlSpider's rules (at least not in a simple way).



          The usual way to do this is documented in Passing additional data to callback functions.

          You would extract the category in your first callback, and then create a new request passing this information in the meta dict.






          share|improve this answer
























          • Thank you for help) I use meta in requests, but here I try to use Rule. I dont understand where I need to pass meta. Can you give some code example.

            – rusrom
            Nov 21 '18 at 17:38











          • Unfortunately, your callback and the rule's request creation are completely separated, so you can't do this using rules (at least without overwriting CrawlSpider's private methods). You will have to find links and generate requests manually (just like shown in the linked docs)

            – stranac
            Nov 21 '18 at 18:12
















          0














          What you want is not really doable using a CrawlSpider's rules (at least not in a simple way).



          The usual way to do this is documented in Passing additional data to callback functions.

          You would extract the category in your first callback, and then create a new request passing this information in the meta dict.






          share|improve this answer
























          • Thank you for help) I use meta in requests, but here I try to use Rule. I dont understand where I need to pass meta. Can you give some code example.

            – rusrom
            Nov 21 '18 at 17:38











          • Unfortunately, your callback and the rule's request creation are completely separated, so you can't do this using rules (at least without overwriting CrawlSpider's private methods). You will have to find links and generate requests manually (just like shown in the linked docs)

            – stranac
            Nov 21 '18 at 18:12














          0












          0








          0







          What you want is not really doable using a CrawlSpider's rules (at least not in a simple way).



          The usual way to do this is documented in Passing additional data to callback functions.

          You would extract the category in your first callback, and then create a new request passing this information in the meta dict.






          share|improve this answer













          What you want is not really doable using a CrawlSpider's rules (at least not in a simple way).



          The usual way to do this is documented in Passing additional data to callback functions.

          You would extract the category in your first callback, and then create a new request passing this information in the meta dict.







          share|improve this answer












          share|improve this answer



          share|improve this answer










          answered Nov 21 '18 at 17:01









          stranacstranac

          14.5k31725




          14.5k31725













          • Thank you for help) I use meta in requests, but here I try to use Rule. I dont understand where I need to pass meta. Can you give some code example.

            – rusrom
            Nov 21 '18 at 17:38











          • Unfortunately, your callback and the rule's request creation are completely separated, so you can't do this using rules (at least without overwriting CrawlSpider's private methods). You will have to find links and generate requests manually (just like shown in the linked docs)

            – stranac
            Nov 21 '18 at 18:12



















          • Thank you for help) I use meta in requests, but here I try to use Rule. I dont understand where I need to pass meta. Can you give some code example.

            – rusrom
            Nov 21 '18 at 17:38











          • Unfortunately, your callback and the rule's request creation are completely separated, so you can't do this using rules (at least without overwriting CrawlSpider's private methods). You will have to find links and generate requests manually (just like shown in the linked docs)

            – stranac
            Nov 21 '18 at 18:12

















          Thank you for help) I use meta in requests, but here I try to use Rule. I dont understand where I need to pass meta. Can you give some code example.

          – rusrom
          Nov 21 '18 at 17:38





          Thank you for help) I use meta in requests, but here I try to use Rule. I dont understand where I need to pass meta. Can you give some code example.

          – rusrom
          Nov 21 '18 at 17:38













          Unfortunately, your callback and the rule's request creation are completely separated, so you can't do this using rules (at least without overwriting CrawlSpider's private methods). You will have to find links and generate requests manually (just like shown in the linked docs)

          – stranac
          Nov 21 '18 at 18:12





          Unfortunately, your callback and the rule's request creation are completely separated, so you can't do this using rules (at least without overwriting CrawlSpider's private methods). You will have to find links and generate requests manually (just like shown in the linked docs)

          – stranac
          Nov 21 '18 at 18:12




















          draft saved

          draft discarded




















































          Thanks for contributing an answer to Stack Overflow!


          • Please be sure to answer the question. Provide details and share your research!

          But avoid



          • Asking for help, clarification, or responding to other answers.

          • Making statements based on opinion; back them up with references or personal experience.


          To learn more, see our tips on writing great answers.




          draft saved


          draft discarded














          // On page ready, call StackExchange.openid.initPostLogin for the
          // '.new-post-login' selector. The second argument is a percent-encoded
          // URL pointing at this question's #new-answer anchor (presumably the
          // post-login return address; confirm against the StackExchange API).
          StackExchange.ready(
          function () {
          StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53414127%2fscrape-multilevel-menu-using-scrapy-1-5%23new-answer', 'question_page');
          }
          );

          Post as a guest















          Required, but never shown





















































          Required, but never shown














          Required, but never shown












          Required, but never shown







          Required, but never shown

































          Required, but never shown














          Required, but never shown












          Required, but never shown







          Required, but never shown







          這個網誌中的熱門文章

          Academy of Television Arts & Sciences

          L'Équipe

          1995 France bombings