diff --git a/ExtensionCrawler/db.py b/ExtensionCrawler/db.py index 351589d..b105b90 100644 --- a/ExtensionCrawler/db.py +++ b/ExtensionCrawler/db.py @@ -146,8 +146,13 @@ def parse_and_insert_overview(ext_id, date, datepath, con): full_description = str( description_parent.parent) if description_parent else None - developer_parent = doc.find( + offeredby_parent = doc.find( class_=lambda cls: cls and "e-f-Me" in cls) + offeredby = "".join([str(x) for x in offeredby_parent.contents + ]) if offeredby_parent else None + + developer_parent = doc.find( + class_=lambda cls: cls and "C-b-p-rc-D-J" in cls) developer = "".join([str(x) for x in developer_parent.contents ]) if developer_parent else None @@ -163,6 +168,7 @@ def parse_and_insert_overview(ext_id, date, datepath, con): contents) itemcategory = match.group(1) if match else None + con.insert( "extension", extid=ext_id, @@ -174,6 +180,7 @@ def parse_and_insert_overview(ext_id, date, datepath, con): rating=rating, ratingcount=rating_count, fulldescription=full_description, + offeredby=offeredby, developer=developer, itemcategory=itemcategory, crx_etag=etag, diff --git a/database/schemas/extension.sql b/database/schemas/extension.sql index 2933dfb..89667f6 100644 --- a/database/schemas/extension.sql +++ b/database/schemas/extension.sql @@ -30,6 +30,7 @@ CREATE TABLE `extension` ( `rating` double DEFAULT NULL, `ratingcount` int(11) DEFAULT NULL, `fulldescription` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, + `offeredby` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, `developer` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, `itemcategory` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, `crx_etag` varchar(44) COLLATE utf8mb4_unicode_ci DEFAULT NULL, diff --git a/database/views/extension_most_recent.sql b/database/views/extension_most_recent.sql index 400493e..bb822d1 100644 --- a/database/views/extension_most_recent.sql +++ b/database/views/extension_most_recent.sql @@ -27,7 +27,7 @@ /*!50001 SET collation_connection = utf8_general_ci */; /*!50001 CREATE ALGORITHM=UNDEFINED */ /*!50013 DEFINER=`writer`@`%` SQL SECURITY DEFINER */ -/*!50001 VIEW `extension_most_recent` AS select `e3`.`extid` AS `extid`,`e3`.`date` AS `date`,`e3`.`name` AS `name`,`e3`.`version` AS `version`,`e3`.`description` AS `description`,`e3`.`downloads` AS `downloads`,`e3`.`rating` AS `rating`,`e3`.`ratingcount` AS `ratingcount`,`e3`.`fulldescription` AS `fulldescription`,`e3`.`developer` AS `developer`,`e3`.`itemcategory` AS `itemcategory`,`e3`.`crx_etag` AS `crx_etag`,`e3`.`lastupdated` AS `lastupdated` from (((select `e1`.`extid` AS `extid`,max(`e1`.`date`) AS `date` from `extensions`.`extension` `e1` group by `e1`.`extid`)) `e2` join `extensions`.`extension` `e3` on(`e2`.`extid` = `e3`.`extid` and `e2`.`date` = `e3`.`date`)) */; +/*!50001 VIEW `extension_most_recent` AS select `e3`.`extid` AS `extid`,`e3`.`date` AS `date`,`e3`.`name` AS `name`,`e3`.`version` AS `version`,`e3`.`description` AS `description`,`e3`.`downloads` AS `downloads`,`e3`.`rating` AS `rating`,`e3`.`ratingcount` AS `ratingcount`,`e3`.`fulldescription` AS `fulldescription`,`e3`.`offeredby` AS `offeredby`,`e3`.`developer` AS `developer`,`e3`.`itemcategory` AS `itemcategory`,`e3`.`crx_etag` AS `crx_etag`,`e3`.`lastupdated` AS `lastupdated` from (((select `e1`.`extid` AS `extid`,max(`e1`.`date`) AS `date` from `extensions`.`extension` `e1` group by `e1`.`extid`)) `e2` join `extensions`.`extension` `e3` on(`e2`.`extid` = `e3`.`extid` and `e2`.`date` = `e3`.`date`)) */; /*!50001 SET character_set_client = @saved_cs_client */; /*!50001 SET character_set_results = @saved_cs_results */; /*!50001 SET collation_connection = @saved_col_connection */; diff --git a/database/views/extension_most_recent_small.sql b/database/views/extension_most_recent_small.sql index 35bf58a..0a6a15f 100644 --- a/database/views/extension_most_recent_small.sql +++ b/database/views/extension_most_recent_small.sql @@ -27,7 +27,7 @@ /*!50001 SET collation_connection = utf8_general_ci */; /*!50001 CREATE ALGORITHM=UNDEFINED */ /*!50013 DEFINER=`writer`@`%` SQL SECURITY DEFINER */ -/*!50001 VIEW `extension_most_recent_small` AS select `e3`.`extid` AS `extid`,`e3`.`date` AS `date`,`e3`.`name` AS `name`,`e3`.`version` AS `version`,`e3`.`description` AS `description`,`e3`.`downloads` AS `downloads`,`e3`.`rating` AS `rating`,`e3`.`ratingcount` AS `ratingcount`,`e3`.`fulldescription` AS `fulldescription`,`e3`.`developer` AS `developer`,`e3`.`itemcategory` AS `itemcategory`,`e3`.`crx_etag` AS `crx_etag`,`e3`.`lastupdated` AS `lastupdated` from (((select `e1`.`extid` AS `extid`,max(`e1`.`date`) AS `date` from `extensions`.`extension` `e1` where `e1`.`extid` like 'aa%' group by `e1`.`extid`)) `e2` join `extensions`.`extension` `e3` on(`e2`.`extid` = `e3`.`extid` and `e2`.`date` = `e3`.`date`)) */; +/*!50001 VIEW `extension_most_recent_small` AS select `e3`.`extid` AS `extid`,`e3`.`date` AS `date`,`e3`.`name` AS `name`,`e3`.`version` AS `version`,`e3`.`description` AS `description`,`e3`.`downloads` AS `downloads`,`e3`.`rating` AS `rating`,`e3`.`ratingcount` AS `ratingcount`,`e3`.`fulldescription` AS `fulldescription`,`e3`.`offeredby` AS `offeredby`,`e3`.`developer` AS `developer`,`e3`.`itemcategory` AS `itemcategory`,`e3`.`crx_etag` AS `crx_etag`,`e3`.`lastupdated` AS `lastupdated` from (((select `e1`.`extid` AS `extid`,max(`e1`.`date`) AS `date` from `extensions`.`extension` `e1` where `e1`.`extid` like 'aa%' group by `e1`.`extid`)) `e2` join `extensions`.`extension` `e3` on(`e2`.`extid` = `e3`.`extid` and `e2`.`date` = `e3`.`date`)) */; /*!50001 SET character_set_client = @saved_cs_client */; /*!50001 SET character_set_results = @saved_cs_results */; /*!50001 SET collation_connection = @saved_col_connection */; diff --git a/database/views/extension_most_recent_until_date.sql b/database/views/extension_most_recent_until_date.sql index d6796e6..e11137f 100644 --- a/database/views/extension_most_recent_until_date.sql +++ b/database/views/extension_most_recent_until_date.sql @@ -30,7 +30,7 @@ create function until_date returns datetime NO SQL DEERMINISTIC return @until_da /*!50001 SET collation_connection = utf8_general_ci */; /*!50001 CREATE ALGORITHM=UNDEFINED */ /*!50013 DEFINER=`root`@`localhost` SQL SECURITY DEFINER */ -/*!50001 VIEW `extension_most_recent_until_date` AS select `e1`.`extid` AS `extid`,`e1`.`date` AS `date`,`extensions`.`extension`.`name` AS `name`,`extensions`.`extension`.`version` AS `version`,`extensions`.`extension`.`description` AS `description`,`extensions`.`extension`.`downloads` AS `downloads`,`extensions`.`extension`.`rating` AS `rating`,`extensions`.`extension`.`ratingcount` AS `ratingcount`,`extensions`.`extension`.`fulldescription` AS `fulldescription`,`extensions`.`extension`.`developer` AS `developer`,`extensions`.`extension`.`itemcategory` AS `itemcategory`,`extensions`.`extension`.`crx_etag` AS `crx_etag`,`extensions`.`extension`.`lastupdated` AS `lastupdated`,`extensions`.`extension`.`last_modified` AS `last_modified` from (((select `extensions`.`extension`.`extid` AS `extid`,max(`extensions`.`extension`.`date`) AS `date` from `extensions`.`extension` where `extensions`.`extension`.`date` <= `until_date`() group by `extensions`.`extension`.`extid`)) `e1` join `extensions`.`extension` on(`e1`.`extid` = `extensions`.`extension`.`extid` and `e1`.`date` = `extensions`.`extension`.`date`)) */; +/*!50001 VIEW `extension_most_recent_until_date` AS select `e1`.`extid` AS `extid`,`e1`.`date` AS `date`,`extensions`.`extension`.`name` AS `name`,`extensions`.`extension`.`version` AS `version`,`extensions`.`extension`.`description` AS `description`,`extensions`.`extension`.`downloads` AS `downloads`,`extensions`.`extension`.`rating` AS `rating`,`extensions`.`extension`.`ratingcount` AS `ratingcount`,`extensions`.`extension`.`fulldescription` AS `fulldescription`,`extensions`.`extension`.`offeredby` AS `offeredby`, `extensions`.`extension`.`developer` AS `developer`,`extensions`.`extension`.`itemcategory` AS `itemcategory`,`extensions`.`extension`.`crx_etag` AS `crx_etag`,`extensions`.`extension`.`lastupdated` AS `lastupdated`,`extensions`.`extension`.`last_modified` AS `last_modified` from (((select `extensions`.`extension`.`extid` AS `extid`,max(`extensions`.`extension`.`date`) AS `date` from `extensions`.`extension` where `extensions`.`extension`.`date` <= `until_date`() group by `extensions`.`extension`.`extid`)) `e1` join `extensions`.`extension` on(`e1`.`extid` = `extensions`.`extension`.`extid` and `e1`.`date` = `extensions`.`extension`.`date`)) */; /*!50001 SET character_set_client = @saved_cs_client */; /*!50001 SET character_set_results = @saved_cs_results */; /*!50001 SET collation_connection = @saved_col_connection */; diff --git a/database/views/extension_second_most_recent.sql b/database/views/extension_second_most_recent.sql index d652267..d8086ed 100644 --- a/database/views/extension_second_most_recent.sql +++ b/database/views/extension_second_most_recent.sql @@ -27,7 +27,7 @@ /*!50001 SET collation_connection = utf8_general_ci */; /*!50001 CREATE ALGORITHM=UNDEFINED */ /*!50013 DEFINER=`root`@`localhost` SQL SECURITY DEFINER */ -/*!50001 VIEW `extension_second_most_recent` AS select `e1`.`extid` AS `extid`,`e1`.`date` AS `date`,`extensions`.`extension`.`name` AS `name`,`extensions`.`extension`.`version` AS `version`,`extensions`.`extension`.`description` AS `description`,`extensions`.`extension`.`downloads` AS `downloads`,`extensions`.`extension`.`rating` AS `rating`,`extensions`.`extension`.`ratingcount` AS `ratingcount`,`extensions`.`extension`.`fulldescription` AS `fulldescription`,`extensions`.`extension`.`developer` AS `developer`,`extensions`.`extension`.`itemcategory` AS `itemcategory`,`extensions`.`extension`.`crx_etag` AS `crx_etag`,`extensions`.`extension`.`lastupdated` AS `lastupdated`,`extensions`.`extension`.`last_modified` AS `last_modified` from (((select `extensions`.`extension`.`extid` AS `extid`,max(`extensions`.`extension`.`date`) AS `date` from `extensions`.`extension` where !((`extensions`.`extension`.`extid`,`extensions`.`extension`.`date`) in (select `extensions`.`extension`.`extid`,max(`extensions`.`extension`.`date`) AS `date` from `extensions`.`extension` group by `extensions`.`extension`.`extid`)) group by `extensions`.`extension`.`extid`)) `e1` join `extensions`.`extension` on(`e1`.`extid` = `extensions`.`extension`.`extid` and `e1`.`date` = `extensions`.`extension`.`date`)) */; +/*!50001 VIEW `extension_second_most_recent` AS select `e1`.`extid` AS `extid`,`e1`.`date` AS `date`,`extensions`.`extension`.`name` AS `name`,`extensions`.`extension`.`version` AS `version`,`extensions`.`extension`.`description` AS `description`,`extensions`.`extension`.`downloads` AS `downloads`,`extensions`.`extension`.`rating` AS `rating`,`extensions`.`extension`.`ratingcount` AS `ratingcount`,`extensions`.`extension`.`fulldescription` AS `fulldescription`,`extensions`.`extension`.`offeredby` AS `offeredby`,`extensions`.`extension`.`developer` AS `developer`,`extensions`.`extension`.`itemcategory` AS `itemcategory`,`extensions`.`extension`.`crx_etag` AS `crx_etag`,`extensions`.`extension`.`lastupdated` AS `lastupdated`,`extensions`.`extension`.`last_modified` AS `last_modified` from (((select `extensions`.`extension`.`extid` AS `extid`,max(`extensions`.`extension`.`date`) AS `date` from `extensions`.`extension` where !((`extensions`.`extension`.`extid`,`extensions`.`extension`.`date`) in (select `extensions`.`extension`.`extid`,max(`extensions`.`extension`.`date`) AS `date` from `extensions`.`extension` group by `extensions`.`extension`.`extid`)) group by `extensions`.`extension`.`extid`)) `e1` join `extensions`.`extension` on(`e1`.`extid` = `extensions`.`extension`.`extid` and `e1`.`date` = `extensions`.`extension`.`date`)) */; /*!50001 SET character_set_client = @saved_cs_client */; /*!50001 SET character_set_results = @saved_cs_results */; /*!50001 SET collation_connection = @saved_col_connection */; diff --git a/database/views/extension_second_most_recent_until_date.sql b/database/views/extension_second_most_recent_until_date.sql index 99ce39f..c2defba 100644 --- a/database/views/extension_second_most_recent_until_date.sql +++ b/database/views/extension_second_most_recent_until_date.sql @@ -30,7 +30,7 @@ create function until_date returns datetime NO SQL DEERMINISTIC return @until_da /*!50001 SET collation_connection = utf8_general_ci */; /*!50001 CREATE ALGORITHM=UNDEFINED */ /*!50013 DEFINER=`root`@`localhost` SQL SECURITY DEFINER */ -/*!50001 VIEW `extension_second_most_recent_until_date` AS select `e1`.`extid` AS `extid`,`e1`.`date` AS `date`,`extensions`.`extension`.`name` AS `name`,`extensions`.`extension`.`version` AS `version`,`extensions`.`extension`.`description` AS `description`,`extensions`.`extension`.`downloads` AS `downloads`,`extensions`.`extension`.`rating` AS `rating`,`extensions`.`extension`.`ratingcount` AS `ratingcount`,`extensions`.`extension`.`fulldescription` AS `fulldescription`,`extensions`.`extension`.`developer` AS `developer`,`extensions`.`extension`.`itemcategory` AS `itemcategory`,`extensions`.`extension`.`crx_etag` AS `crx_etag`,`extensions`.`extension`.`lastupdated` AS `lastupdated`,`extensions`.`extension`.`last_modified` AS `last_modified` from (((select `extensions`.`extension`.`extid` AS `extid`,max(`extensions`.`extension`.`date`) AS `date` from `extensions`.`extension` where `extensions`.`extension`.`date` <= `until_date`() and !((`extensions`.`extension`.`extid`,`extensions`.`extension`.`date`) in (select `extensions`.`extension`.`extid`,max(`extensions`.`extension`.`date`) AS `date` from `extensions`.`extension` where `extensions`.`extension`.`date` <= `until_date`() group by `extensions`.`extension`.`extid`)) group by `extensions`.`extension`.`extid`)) `e1` join `extensions`.`extension` on(`e1`.`extid` = `extensions`.`extension`.`extid` and `e1`.`date` = `extensions`.`extension`.`date`)) */; +/*!50001 VIEW `extension_second_most_recent_until_date` AS select `e1`.`extid` AS `extid`,`e1`.`date` AS `date`,`extensions`.`extension`.`name` AS `name`,`extensions`.`extension`.`version` AS `version`,`extensions`.`extension`.`description` AS `description`,`extensions`.`extension`.`downloads` AS `downloads`,`extensions`.`extension`.`rating` AS `rating`,`extensions`.`extension`.`ratingcount` AS `ratingcount`,`extensions`.`extension`.`fulldescription` AS `fulldescription`,`extensions`.`extension`.`offeredby` AS `offeredby`,`extensions`.`extension`.`developer` AS `developer`, `extensions`.`extension`.`itemcategory` AS `itemcategory`,`extensions`.`extension`.`crx_etag` AS `crx_etag`,`extensions`.`extension`.`lastupdated` AS `lastupdated`,`extensions`.`extension`.`last_modified` AS `last_modified` from (((select `extensions`.`extension`.`extid` AS `extid`,max(`extensions`.`extension`.`date`) AS `date` from `extensions`.`extension` where `extensions`.`extension`.`date` <= `until_date`() and !((`extensions`.`extension`.`extid`,`extensions`.`extension`.`date`) in (select `extensions`.`extension`.`extid`,max(`extensions`.`extension`.`date`) AS `date` from `extensions`.`extension` where `extensions`.`extension`.`date` <= `until_date`() group by `extensions`.`extension`.`extid`)) group by `extensions`.`extension`.`extid`)) `e1` join `extensions`.`extension` on(`e1`.`extid` = `extensions`.`extension`.`extid` and `e1`.`date` = `extensions`.`extension`.`date`)) */; /*!50001 SET character_set_client = @saved_cs_client */; /*!50001 SET character_set_results = @saved_cs_results */; /*!50001 SET collation_connection = @saved_col_connection */; diff --git a/database/views/extension_small.sql b/database/views/extension_small.sql index e18b275..11102cb 100644 --- a/database/views/extension_small.sql +++ b/database/views/extension_small.sql @@ -27,7 +27,7 @@ /*!50001 SET collation_connection = utf8_general_ci */; /*!50001 CREATE ALGORITHM=UNDEFINED */ /*!50013 DEFINER=`writer`@`%` SQL SECURITY DEFINER */ -/*!50001 VIEW `extension_small` AS select `extension`.`extid` AS `extid`,`extension`.`date` AS `date`,`extension`.`name` AS `name`,`extension`.`version` AS `version`,`extension`.`description` AS `description`,`extension`.`downloads` AS `downloads`,`extension`.`rating` AS `rating`,`extension`.`ratingcount` AS `ratingcount`,`extension`.`fulldescription` AS `fulldescription`,`extension`.`developer` AS `developer`,`extension`.`itemcategory` AS `itemcategory`,`extension`.`crx_etag` AS `crx_etag`,`extension`.`lastupdated` AS `lastupdated` from `extension` where `extension`.`extid` like 'aa%' */; +/*!50001 VIEW `extension_small` AS select `extension`.`extid` AS `extid`,`extension`.`date` AS `date`,`extension`.`name` AS `name`,`extension`.`version` AS `version`,`extension`.`description` AS `description`,`extension`.`downloads` AS `downloads`,`extension`.`rating` AS `rating`,`extension`.`ratingcount` AS `ratingcount`,`extension`.`fulldescription` AS `fulldescription`,`extensions`.`extension`.`offeredby` AS `offeredby`, `extension`.`developer` AS `developer`,`extension`.`itemcategory` AS `itemcategory`,`extension`.`crx_etag` AS `crx_etag`,`extension`.`lastupdated` AS `lastupdated` from `extension` where `extension`.`extid` like 'aa%' */; /*!50001 SET character_set_client = @saved_cs_client */; /*!50001 SET character_set_results = @saved_cs_results */; /*!50001 SET collation_connection = @saved_col_connection */;