如何用 Scrapy 处理 manytomany 字段
How to handle a ManyToMany field with Scrapy
我想在 Django 中使用 Scrapy。
我的目标是把 actors 字段关联(link)到 name 字段,但我不知道如何处理 Django 的 ManyToMany 关系。我的数据库是 MySQL(我没有使用 DjangoItem)。
models.py
class Movies(models.Model):
    """A movie record whose attributes are all stored as text columns."""

    # Identifier of the movie; also serves as the record's string form.
    content_ID = models.CharField(max_length=30)
    release_date = models.CharField(max_length=30)
    running_time = models.CharField(max_length=10)
    # Actors are kept in a single text column here, not as a relation.
    actors = models.CharField(max_length=300)
    series = models.CharField(max_length=30)
    director = models.CharField(max_length=30)
    label = models.CharField(max_length=30)
    # The three image columns are optional (nullable).
    # NOTE(review): presumably populated by an image-download step - confirm.
    image_urls = models.CharField(max_length=200, null=True)
    images = models.TextField(null=True)
    image_paths = models.TextField(null=True)

    def __str__(self):
        """Return the movie's ``content_ID``."""
        return self.content_ID
class Actors(models.Model):
    """An actor together with the movie list entries the actor is linked to."""

    # Nullable, so __str__ below must cope with None.
    names = models.CharField(max_length=100, null=True)
    # NOTE(review): EnMovielist is not defined in the visible code; verify
    # that it is imported, or whether the relation should target Movies.
    movielist = models.ManyToManyField(EnMovielist)
    image_urls = models.CharField(max_length=200)
    images = models.TextField(null=True)
    image_paths = models.TextField(null=True)

    def __str__(self):
        """Return the actor's name, or an empty string when it is unset."""
        # The field is called `names`; `self.name` raised AttributeError.
        # __str__ must return a str, hence the fallback for a NULL value.
        return self.names or ""
https://github.com/DevProfi/scrapy-djangoitem
在 Scrapy 这一侧,我使用 pipeline 来处理:
class ItemPersistencePipeline(object):
    """Item pipeline that stores each scraped item as a model instance."""

    def process_item(self, item, spider, partial=True):
        """Create or update the model instance that corresponds to ``item``.

        Args:
            item: The scraped item. It must provide ``get(name)`` and a
                ``_model_fields_m2m`` collection of many-to-many field names.
            spider: The running spider; ``spider.unique_fields`` is passed to
                ``get_or_create`` and ``update_model``.
            partial: Forwarded to ``update_model``.

        Returns:
            ``item`` in every case, so later pipeline stages always receive
            an item rather than an exception object.
        """
        import logging

        try:
            item_model = item_to_model(item)
        except TypeError:
            # The item cannot be turned into a model; pass it on untouched.
            return item

        model, created = get_or_create(item_model, spider.unique_fields)

        if not created:
            # The instance was not created, so it already exists: update it.
            try:
                update_model(
                    destination=model,
                    source=item_model,
                    item=item,
                    fields=spider.unique_fields,
                    partial=partial,
                )
            except Exception:
                # Previously the exception object itself was returned and
                # would have travelled on as if it were an item. Log it and
                # hand back the item, like the TypeError branch above.
                logging.getLogger(__name__).exception(
                    "Failed to update %r from scraped item", model
                )
                return item
        else:
            # The instance was just created: attach its many-to-many values,
            # if the item carries any.
            for name in sorted(item._model_fields_m2m):
                values = item.get(name)
                if not values:
                    continue
                # set() needs an iterable of objects; accept a single value.
                if isinstance(values, (str, bytes)) or not hasattr(values, "__iter__"):
                    values = [values]
                getattr(model, name).set(values)
        # TODO add bulk insert model fields
        return item
def update_model(destination, source, item, fields, partial, commit=False):
    """Copy field values from ``source`` onto ``destination``, saving if needed.

    Args:
        destination: Model instance that receives the values and is saved.
        source: Model instance the new values are read from.
        item: Scraped item; ``item._model_fields_m2m`` names the many-to-many
            fields and ``item.get(name)`` yields their new value(s).
        fields: Unused here; kept so existing callers keep working.
        partial: When true, a value is copied only if it is truthy and
            differs from the current one. When false, every field is
            overwritten.
        commit: Initial "needs saving" flag; it becomes true as soon as a
            field is written, so an unchanged object is not saved again.

    Returns:
        ``destination``.
    """
    original_pk = destination.pk

    # Many-to-many fields cannot be assigned with setattr(); they are
    # excluded here and handled through their managers further down.
    m2m_names = [field.name for field in sorted(source._meta.many_to_many)]
    for key in fields_for_model(source, exclude=m2m_names):
        val_old = getattr(destination, key)
        try:
            val_new = getattr(source, key)
        except ObjectDoesNotExist:
            continue
        if partial:
            if val_new and val_new != val_old:
                setattr(destination, key, val_new)
                commit = True
        else:
            setattr(destination, key, val_new)
            commit = True

    # Keep the primary key intact if the copy loop overwrote it.
    # NOTE(review): the original test was `if not pk`, which could never
    # restore a real pk - confirm this is the intended guard.
    if destination.pk != original_pk:
        destination.pk = original_pk

    if commit:
        destination.save()

    for name in sorted(item._model_fields_m2m):
        values = item.get(name)
        if not values:
            continue
        # Accept a single related object as well as a list of them.
        if isinstance(values, (str, bytes)) or not hasattr(values, "__iter__"):
            values = [values]
        manager = getattr(destination, name)
        existing = list(manager.all())
        missing = [obj for obj in values if obj not in existing]
        if missing:
            manager.add(*missing)
    return destination
我想在 Django 中使用 Scrapy。
我的目标是把 actors 字段关联(link)到 name 字段,但我不知道如何处理 Django 的 ManyToMany 关系。我的数据库是 MySQL(我没有使用 DjangoItem)。
models.py
class Movies(models.Model):
    """A movie record whose attributes are all stored as text columns."""

    # Identifier of the movie; also serves as the record's string form.
    content_ID = models.CharField(max_length=30)
    release_date = models.CharField(max_length=30)
    running_time = models.CharField(max_length=10)
    # Actors are kept in a single text column here, not as a relation.
    actors = models.CharField(max_length=300)
    series = models.CharField(max_length=30)
    director = models.CharField(max_length=30)
    label = models.CharField(max_length=30)
    # The three image columns are optional (nullable).
    # NOTE(review): presumably populated by an image-download step - confirm.
    image_urls = models.CharField(max_length=200, null=True)
    images = models.TextField(null=True)
    image_paths = models.TextField(null=True)

    def __str__(self):
        """Return the movie's ``content_ID``."""
        return self.content_ID
class Actors(models.Model):
    """An actor together with the movie list entries the actor is linked to."""

    # Nullable, so __str__ below must cope with None.
    names = models.CharField(max_length=100, null=True)
    # NOTE(review): EnMovielist is not defined in the visible code; verify
    # that it is imported, or whether the relation should target Movies.
    movielist = models.ManyToManyField(EnMovielist)
    image_urls = models.CharField(max_length=200)
    images = models.TextField(null=True)
    image_paths = models.TextField(null=True)

    def __str__(self):
        """Return the actor's name, or an empty string when it is unset."""
        # The field is called `names`; `self.name` raised AttributeError.
        # __str__ must return a str, hence the fallback for a NULL value.
        return self.names or ""
https://github.com/DevProfi/scrapy-djangoitem —— 在 Scrapy 这一侧,我使用 pipeline 来处理:
class ItemPersistencePipeline(object):
    """Item pipeline that stores each scraped item as a model instance."""

    def process_item(self, item, spider, partial=True):
        """Create or update the model instance that corresponds to ``item``.

        Args:
            item: The scraped item. It must provide ``get(name)`` and a
                ``_model_fields_m2m`` collection of many-to-many field names.
            spider: The running spider; ``spider.unique_fields`` is passed to
                ``get_or_create`` and ``update_model``.
            partial: Forwarded to ``update_model``.

        Returns:
            ``item`` in every case, so later pipeline stages always receive
            an item rather than an exception object.
        """
        import logging

        try:
            item_model = item_to_model(item)
        except TypeError:
            # The item cannot be turned into a model; pass it on untouched.
            return item

        model, created = get_or_create(item_model, spider.unique_fields)

        if not created:
            # The instance was not created, so it already exists: update it.
            try:
                update_model(
                    destination=model,
                    source=item_model,
                    item=item,
                    fields=spider.unique_fields,
                    partial=partial,
                )
            except Exception:
                # Previously the exception object itself was returned and
                # would have travelled on as if it were an item. Log it and
                # hand back the item, like the TypeError branch above.
                logging.getLogger(__name__).exception(
                    "Failed to update %r from scraped item", model
                )
                return item
        else:
            # The instance was just created: attach its many-to-many values,
            # if the item carries any.
            for name in sorted(item._model_fields_m2m):
                values = item.get(name)
                if not values:
                    continue
                # set() needs an iterable of objects; accept a single value.
                if isinstance(values, (str, bytes)) or not hasattr(values, "__iter__"):
                    values = [values]
                getattr(model, name).set(values)
        # TODO add bulk insert model fields
        return item
def update_model(destination, source, item, fields, partial, commit=False):
    """Copy field values from ``source`` onto ``destination``, saving if needed.

    Args:
        destination: Model instance that receives the values and is saved.
        source: Model instance the new values are read from.
        item: Scraped item; ``item._model_fields_m2m`` names the many-to-many
            fields and ``item.get(name)`` yields their new value(s).
        fields: Unused here; kept so existing callers keep working.
        partial: When true, a value is copied only if it is truthy and
            differs from the current one. When false, every field is
            overwritten.
        commit: Initial "needs saving" flag; it becomes true as soon as a
            field is written, so an unchanged object is not saved again.

    Returns:
        ``destination``.
    """
    original_pk = destination.pk

    # Many-to-many fields cannot be assigned with setattr(); they are
    # excluded here and handled through their managers further down.
    m2m_names = [field.name for field in sorted(source._meta.many_to_many)]
    for key in fields_for_model(source, exclude=m2m_names):
        val_old = getattr(destination, key)
        try:
            val_new = getattr(source, key)
        except ObjectDoesNotExist:
            continue
        if partial:
            if val_new and val_new != val_old:
                setattr(destination, key, val_new)
                commit = True
        else:
            setattr(destination, key, val_new)
            commit = True

    # Keep the primary key intact if the copy loop overwrote it.
    # NOTE(review): the original test was `if not pk`, which could never
    # restore a real pk - confirm this is the intended guard.
    if destination.pk != original_pk:
        destination.pk = original_pk

    if commit:
        destination.save()

    for name in sorted(item._model_fields_m2m):
        values = item.get(name)
        if not values:
            continue
        # Accept a single related object as well as a list of them.
        if isinstance(values, (str, bytes)) or not hasattr(values, "__iter__"):
            values = [values]
        manager = getattr(destination, name)
        existing = list(manager.all())
        missing = [obj for obj in values if obj not in existing]
        if missing:
            manager.add(*missing)
    return destination