太长不看版
# loader需要传入response
# ArticleItemLoader(item=ArticleItem(), response=response)
def urljoin_with_context(url, loader_context):
    """Resolve image URLs against the URL of the response being parsed.

    Args:
        url: The collected URL values. Normally a list of strings (what an
            output processor receives); a single string is also accepted
            and is treated as a one-element list.
        loader_context: The item loader context. Its ``response`` entry
            supplies the base URL through its ``url`` attribute.

    Returns:
        A list containing every input URL joined onto the response URL.
    """
    # A bare string would otherwise be iterated character by character,
    # yielding one bogus URL per character.
    urls = [url] if isinstance(url, str) else url
    # loader_context receives the context that was passed to the loader; the
    # default loader context carries the response.
    response = loader_context.get('response')
    return [urljoin(response.url, u) for u in urls]
class ArticleItemLoader(ItemLoader):
    """Item loader whose ``images`` output is built from absolute URLs."""

    # Output processor for the `images` field: first resolve every URL with
    # urljoin_with_context, then hand the result to Join(";").
    images_out = Compose(
        urljoin_with_context,
        Join(";"),
    )
背景
由于需要在ItemLoader中对图片链接进行补全,需要用到response.url属性,但又没找到能传入和接收到response的地方
但看到实例化Itemloader的时候传入了response的值,觉得应该可以通过某种方式调用到
本文主角 Item Loader Context
前置介绍
https://docs.scrapy.org/en/latest/topics/loaders.html#item-loader-context
源码分析
scrapy/loader/__init__.py
我们定义的itemloader一般都是继承这个类的,可以看到response传入到了context中
class ItemLoader(itemloaders.ItemLoader):
    # Subclass of itemloaders.ItemLoader whose constructor additionally
    # accepts a `response`.
    default_item_class = Item
    default_selector_class = Selector

    def __init__(self, item=None, selector=None, response=None, parent=None, **context):
        # Build a selector from the response when the caller did not supply one.
        if selector is None and response is not None:
            try:
                selector = self.default_selector_class(response)
            except AttributeError:
                # The selector could not be built from this response; go on without one.
                selector = None
        # The response is stored in the context here, and the context is then
        # forwarded to the parent constructor as keyword arguments.
        context.update(response=response)
        super().__init__(item=item, selector=selector, parent=parent, **context)
scrapy的ItemLoader则是继承自itemloaders包中的ItemLoader
class ItemLoader:
    ...

    def __init__(self, item=None, selector=None, parent=None, **context):
        self.selector = selector
        # The selector is also made available through the context.
        context.update(selector=selector)
        if item is None:
            # No item supplied: create one from the class-level default.
            item = self.default_item_class()
        self._local_item = item
        # The item is placed in the context as well.
        context['item'] = item
        # Every extra keyword argument given to the constructor ends up in
        # self.context, together with `selector` and `item`.
        self.context = context
        self.parent = parent
        self._local_values = {}
        # values from initial item
        # NOTE(review): `_values` is not defined in this excerpt; presumably it
        # is backed by `_local_values` -- confirm against the full class.
        for field_name, value in ItemAdapter(item).items():
            self._values.setdefault(field_name, [])
            self._values[field_name] += arg_to_iter(value)
这里可以看到context
def get_output_value(self, field_name):
    """
    Return the collected values parsed using the output processor, for the
    given field. This method doesn't populate or modify the item at all.
    """
    # Fetch the output processor defined on the item loader (the `xxx_out` attribute).
    proc = self.get_output_processor(field_name)
    # Wrap the processor with the loader context; this is where the context
    # gets passed to our own processor.
    proc = wrap_loader_context(proc, self.context)
    value = self._values.get(field_name, [])
    try:
        return proc(value)
    except Exception as e:
        # Any processor failure is re-raised as ValueError naming the field and value.
        raise ValueError("Error with output processor: field=%r value=%r error='%s: %s'" %
                         (field_name, value, type(e).__name__, str(e)))
在get_output_value中调试跟踪可以看到我们处理的字段和定义的输出处理器,重点在wrap_loader_context对处理器进行的包装,点进去可以看到wrap_loader_context的具体方法
def wrap_loader_context(function, context):
    """Wrap functions that receive loader_context to contain the context
    "pre-loaded" and expose a interface that receives only one argument
    """
    # Only callables that declare a `loader_context` argument get the context bound.
    if 'loader_context' in get_func_args(function):
        # Bound by keyword, so the parameter must be named exactly `loader_context`.
        return partial(function, loader_context=context)
    else:
        # Callables without that argument are returned untouched.
        return function
这里会判断loader_context这个参数是否在我们定义的方法的参数列表中,如果我们定义的方法有这个参数,则会传入context
compose
class Compose:
    # Callable that feeds a value through `functions` in order, passing each
    # function's result to the next one.
    def __init__(self, *functions, **default_loader_context):
        self.functions = functions
        # `stop_on_none` is read from the keyword arguments and defaults to True.
        self.stop_on_none = default_loader_context.get('stop_on_none', True)
        self.default_loader_context = default_loader_context

    def __call__(self, value, loader_context=None):
        if loader_context:
            # Lookups try the passed-in context first, then the defaults given
            # at construction time.
            context = ChainMap(loader_context, self.default_loader_context)
        else:
            context = self.default_loader_context
        # Bind the context to every function that declares `loader_context`
        # (see wrap_loader_context).
        wrapped_funcs = [wrap_loader_context(f, context) for f in self.functions]
        for func in wrapped_funcs:
            # Stop the chain early once the value is None, unless stop_on_none is off.
            if value is None and self.stop_on_none:
                break
            try:
                value = func(value)
            except Exception as e:
                # Failures are re-raised as ValueError naming the function and value.
                raise ValueError("Error in Compose with "
                                 "%s value=%r error='%s: %s'" %
                                 (str(func), value, type(e).__name__, str(e)))
        return value
再回过头来看Compose,可以看到这里的__call__已经带有loader_context参数,所以能够接收到传入的loader_context。debug进来可以看到,这里的loader_context里应该包含response、selector、item;再通过wrap_loader_context对传入的functions进行包装,使其能够接收context。所以我们在自定义的函数中只要定义了loader_context参数,即可接收到context,MapCompose同理