From 031d4de1766ac4586857960a953a6728f3359471 Mon Sep 17 00:00:00 2001 From: goulven Date: Fri, 1 Nov 2024 15:51:05 +0100 Subject: [PATCH] Handling icecat PDF's bugs --- .../api/services/CompletionFacadeService.java | 2 +- .../realtime/MediaAggregationService.java | 14 +++ .../completion/IcecatCompletionService.java | 88 ++++++++++++------- 3 files changed, 72 insertions(+), 32 deletions(-) diff --git a/api/src/main/java/org/open4goods/api/services/CompletionFacadeService.java b/api/src/main/java/org/open4goods/api/services/CompletionFacadeService.java index 1717c72d1..eb2d19da0 100644 --- a/api/src/main/java/org/open4goods/api/services/CompletionFacadeService.java +++ b/api/src/main/java/org/open4goods/api/services/CompletionFacadeService.java @@ -66,7 +66,7 @@ public void amazonCompletionAll() throws InvalidParameterException, IOException // Icecat completion /////////////////////////////////// public void icecatCompletionAll() throws InvalidParameterException, IOException { - logger.warn("Completing verticals with amazon"); + logger.warn("Completing verticals with icecat"); icecatCompletionService.completeAll(true); } diff --git a/api/src/main/java/org/open4goods/api/services/aggregation/services/realtime/MediaAggregationService.java b/api/src/main/java/org/open4goods/api/services/aggregation/services/realtime/MediaAggregationService.java index 5e17989e8..d6c875cc0 100644 --- a/api/src/main/java/org/open4goods/api/services/aggregation/services/realtime/MediaAggregationService.java +++ b/api/src/main/java/org/open4goods/api/services/aggregation/services/realtime/MediaAggregationService.java @@ -72,6 +72,9 @@ public Map onDataFragment(final DataFragment input, final Produ output.getResources().add(r); } } + + onProduct(output, vConf); + return null; } @@ -83,6 +86,17 @@ public void close() throws IOException { @Override public void onProduct(Product data, VerticalConfig vConf) throws AggregationSkipException { + + // TODO(p1, perf) : Remove when sure there are no 
more protected urls + // We clean icecat protected items + data.getResources().removeIf(e -> { + if (e.getUrl() != null && e.getUrl().contains("icecat.biz") && e.getUrl().contains("?access")) { + logger.error("Removing icecat protected url : {}",e.getUrl()); + return true; + } else { + return false; + } + }); } diff --git a/api/src/main/java/org/open4goods/api/services/completion/IcecatCompletionService.java b/api/src/main/java/org/open4goods/api/services/completion/IcecatCompletionService.java index 98de862b7..99e7c2c6b 100644 --- a/api/src/main/java/org/open4goods/api/services/completion/IcecatCompletionService.java +++ b/api/src/main/java/org/open4goods/api/services/completion/IcecatCompletionService.java @@ -101,6 +101,10 @@ public void processProduct(VerticalConfig vertical, Product data) { logger.error("Error occurs during icecat aggregation",e); } } + + + + try { @@ -161,10 +165,10 @@ private Set completeSearch(VerticalConfig vertical, Product data) private DataFragment convert(IceDataItem iceItem, Product data) { DataFragment df = initDataFragment(data); - completeGeneralInfos(iceItem.generalInfo, df); - completeImage(iceItem.image, df); - completeMultimedia(iceItem.multimedia,df); - completeGallery(iceItem.gallery,df); + completeGeneralInfos(iceItem.generalInfo, df,data); + completeImage(iceItem.image, df, data); + completeMultimedia(iceItem.multimedia,df,data); + completeGallery(iceItem.gallery,df,data); completeFeaturesGroup(iceItem.featuresGroups,df); @@ -207,63 +211,88 @@ private void completeFeaturesGroup(List featuresGroups, DataFrag } - private void completeGallery(List gallery, DataFragment df) { + private void completeGallery(List gallery, DataFragment df, Product p) { for (Gallery g : gallery) { try { - // TODO : mutualize tag - df.addResource(g.pic , Sets.newHashSet(g.type,"gallery")); + addResourceIfAbsent(df, p, g.pic, g.type); } catch (ValidationException e) { - // TODO Auto-generated catch block - e.printStackTrace(); + logger.warn("Error while adding resource 
{}",g.pic, e); } } } - private void completeMultimedia(List multimedia, DataFragment df) { + /** + * Adds an icecat resource to the fragment, unless its url carries an "?access" token and the product already holds a processed, non-empty resource whose url starts with the token-less url + * @param df the fragment receiving the resource + * @param p the product whose existing resources are checked + * @param url the resource url ; the tag argument is stored with it, alongside "gallery" + * @throws ValidationException if df.addResource rejects the resource + */ + private void addResourceIfAbsent(DataFragment df, Product p, String url, String tag) throws ValidationException { + + + String shortened = null; + int marker = (url == null) ? -1 : url.indexOf("?access"); + if (marker != -1) { + // TODO(P1,design) : Remove when tested + logger.error("Got an access protected resource from icecat : {} - {}",url,p ); + shortened = url.substring(0,marker); + } + + if (null != shortened) { + + for (Resource r : p.getResources()) { + if (r.getUrl().startsWith(shortened)) { + if (r.isProcessed() == true && r.getFileSize() >0) { + logger.info("Resource have already been processed, skipping {}", url); + return; + } + } + } + } + + df.addResource(url , Sets.newHashSet(tag,"gallery")); + } + + private void completeMultimedia(List multimedia, DataFragment df, Product p) { for (Multimedia m : multimedia) { try { // TODO : handle i18 - df.addResource(m.url , Sets.newHashSet(m.type,"fr")); + addResourceIfAbsent(df, p, m.url, "fr"); } catch (ValidationException e) { - // TODO Auto-generated catch block - e.printStackTrace(); + logger.info("Cannot validate multimedia resource : {}",m.url); } } } - private void completeImage(Image image, DataFragment df) { + private void completeImage(Image image, DataFragment df, Product p) { try { // Tweak to exclude "brand" images sometimes used as logo if (!image.highPic.contains("brand")) { - Resource r = new Resource(image.highPic); - r.getHardTags().add(ResourceTag.PRIMARY); - df.addResource(r); - + addResourceIfAbsent(df, p, image.highPic, ResourceTag.PRIMARY.toString()); } } catch (ValidationException e) { - // TODO Auto-generated catch block - e.printStackTrace(); + logger.info("Cannot validate image resource : {}",image.highPic); } } - private void 
completeGeneralInfos(GeneralInfo e, DataFragment df) { + private void completeGeneralInfos(GeneralInfo e, DataFragment df, Product p) { - // TODO : HAndle end of year / end of year + // TODO(p3, feature) : Handle end of year / end of year if (null != e.releaseDate) { // TODO : i18n try { df.addAttribute("YEAR", e.releaseDate.substring(e.releaseDate.lastIndexOf("-")+1) , "fr", false, null); } catch (Exception e1) { - // TODO Auto-generated catch block - e1.printStackTrace(); + logger.error("Parsing year failed ! ",e1); } } @@ -283,9 +312,7 @@ private void completeGeneralInfos(GeneralInfo e, DataFragment df) { try { if (e.description != null && e.description.leafletPDFURL != null) { - Resource r = new Resource(e.description.leafletPDFURL); - r.getHardTags().add(ResourceTag.LEAFLET); - df.addResource(r); + addResourceIfAbsent(df, p, e.description.leafletPDFURL, ResourceTag.LEAFLET.toString()); } } catch (ValidationException e1) { @@ -295,9 +322,8 @@ private void completeGeneralInfos(GeneralInfo e, DataFragment df) { try { if (e.description != null && e.description.manualPDFURL != null) { - Resource r = new Resource(e.description.manualPDFURL); - r.getHardTags().add(ResourceTag.MANUAL); - df.addResource(r); + addResourceIfAbsent(df, p, e.description.manualPDFURL, ResourceTag.MANUAL.toString()); + } } catch (ValidationException e1) { logger.error("Error while adding manual pdf {}", e.description.leafletPDFURL, e); @@ -318,7 +344,7 @@ private void completeGeneralInfos(GeneralInfo e, DataFragment df) { */ private DataFragment initDataFragment( Product data) { DataFragment df = new DataFragment(); - // TODO : Constants + // TODO(p3,conf) : Constants df.setDatasourceName("icecat.biz"); df.setDatasourceConfigName("icecat.biz.yml"); df.setLastIndexationDate(System.currentTimeMillis());