# -*- coding: utf-8 -*-
  1. """
  2. Scan Data Processor Service
  3. This module handles validation and conversion of uploaded CloudShell scan data.
  4. It validates the JSON structure and converts the data to ScanResult objects
  5. compatible with the existing ReportGenerator.
  6. Requirements:
  7. - 4.2: Validate JSON structure completeness when receiving uploaded data
  8. - 5.1: Generate reports in the same format as existing scan tasks
  9. """
  10. from typing import Any, Dict, List, Tuple, Optional
  11. from datetime import datetime
  12. import logging
  13. from app.scanners.base import ResourceData, ScanResult
  14. logger = logging.getLogger(__name__)
  15. class ScanDataProcessor:
  16. """
  17. Processes uploaded CloudShell scan data.
  18. This class provides functionality to:
  19. - Validate the structure of uploaded JSON scan data
  20. - Convert validated data to ScanResult objects for report generation
  21. Requirements:
  22. - 4.2: Validate JSON structure completeness
  23. - 5.1: Convert to format compatible with existing ReportGenerator
  24. """
  25. # Required metadata fields based on design document ScanData interface
  26. REQUIRED_METADATA_FIELDS = [
  27. 'account_id',
  28. 'scan_timestamp',
  29. 'regions_scanned',
  30. 'services_scanned',
  31. ]
  32. # Optional metadata fields
  33. OPTIONAL_METADATA_FIELDS = [
  34. 'scanner_version',
  35. 'total_resources',
  36. 'total_errors',
  37. ]
  38. # Required top-level fields
  39. REQUIRED_TOP_LEVEL_FIELDS = [
  40. 'metadata',
  41. 'resources',
  42. 'errors',
  43. ]
  44. # Required resource fields based on ResourceData interface
  45. REQUIRED_RESOURCE_FIELDS = [
  46. 'account_id',
  47. 'region',
  48. 'service',
  49. 'resource_type',
  50. 'resource_id',
  51. 'name',
  52. ]
  53. # Required error fields based on ErrorData interface
  54. REQUIRED_ERROR_FIELDS = [
  55. 'service',
  56. 'region',
  57. 'error',
  58. 'error_type',
  59. ]
  60. def validate_scan_data(self, data: Dict[str, Any]) -> Tuple[bool, List[str]]:
  61. """
  62. Validate the structure of uploaded scan data.
  63. This method performs comprehensive validation of the JSON structure
  64. to ensure it conforms to the ScanData interface defined in the design.
  65. Args:
  66. data: Dictionary containing the uploaded scan data
  67. Returns:
  68. Tuple of (is_valid, error_messages):
  69. - is_valid: True if data is valid, False otherwise
  70. - error_messages: List of validation error messages (empty if valid)
  71. Requirements:
  72. - 4.2: Validate JSON structure completeness
  73. - 6.2: Return list of missing fields when validation fails
  74. Validates:
  75. - Property 2: JSON structure completeness (metadata, resources, errors)
  76. - Property 6: Returns all missing field names on validation failure
  77. """
  78. errors: List[str] = []
  79. # Check if data is a dictionary
  80. if not isinstance(data, dict):
  81. errors.append("Data must be a JSON object (dictionary)")
  82. return False, errors
  83. # Validate top-level fields
  84. missing_top_level = self._validate_required_fields(
  85. data,
  86. self.REQUIRED_TOP_LEVEL_FIELDS,
  87. "top-level"
  88. )
  89. errors.extend(missing_top_level)
  90. # If top-level fields are missing, we can't continue validation
  91. if missing_top_level:
  92. return False, errors
  93. # Validate metadata structure
  94. metadata_errors = self._validate_metadata(data.get('metadata', {}))
  95. errors.extend(metadata_errors)
  96. # Validate resources structure
  97. resources_errors = self._validate_resources(data.get('resources', {}))
  98. errors.extend(resources_errors)
  99. # Validate errors structure
  100. errors_field_errors = self._validate_errors_field(data.get('errors', []))
  101. errors.extend(errors_field_errors)
  102. is_valid = len(errors) == 0
  103. if is_valid:
  104. logger.info("Scan data validation passed")
  105. else:
  106. logger.warning(f"Scan data validation failed with {len(errors)} errors")
  107. return is_valid, errors
  108. def _validate_required_fields(
  109. self,
  110. data: Dict[str, Any],
  111. required_fields: List[str],
  112. context: str
  113. ) -> List[str]:
  114. """
  115. Validate that all required fields are present.
  116. Args:
  117. data: Dictionary to validate
  118. required_fields: List of required field names
  119. context: Context string for error messages
  120. Returns:
  121. List of error messages for missing fields
  122. """
  123. errors = []
  124. missing_fields = []
  125. for field in required_fields:
  126. if field not in data:
  127. missing_fields.append(field)
  128. if missing_fields:
  129. errors.append(
  130. f"Missing required {context} fields: {', '.join(missing_fields)}"
  131. )
  132. return errors
  133. def _validate_metadata(self, metadata: Any) -> List[str]:
  134. """
  135. Validate the metadata section of scan data.
  136. Args:
  137. metadata: Metadata dictionary to validate
  138. Returns:
  139. List of validation error messages
  140. Validates:
  141. - Property 2: metadata.account_id, metadata.scan_timestamp,
  142. metadata.regions_scanned, metadata.services_scanned
  143. """
  144. errors = []
  145. # Check if metadata is a dictionary
  146. if not isinstance(metadata, dict):
  147. errors.append("metadata must be a JSON object (dictionary)")
  148. return errors
  149. # Check required metadata fields
  150. missing_fields = []
  151. for field in self.REQUIRED_METADATA_FIELDS:
  152. if field not in metadata:
  153. missing_fields.append(field)
  154. if missing_fields:
  155. errors.append(
  156. f"Missing required metadata fields: {', '.join(missing_fields)}"
  157. )
  158. return errors
  159. # Validate field types
  160. if not isinstance(metadata.get('account_id'), str):
  161. errors.append("metadata.account_id must be a string")
  162. if not isinstance(metadata.get('scan_timestamp'), str):
  163. errors.append("metadata.scan_timestamp must be a string (ISO 8601 format)")
  164. else:
  165. # Validate timestamp format
  166. timestamp_error = self._validate_timestamp(metadata['scan_timestamp'])
  167. if timestamp_error:
  168. errors.append(timestamp_error)
  169. if not isinstance(metadata.get('regions_scanned'), list):
  170. errors.append("metadata.regions_scanned must be an array")
  171. if not isinstance(metadata.get('services_scanned'), list):
  172. errors.append("metadata.services_scanned must be an array")
  173. return errors
  174. def _validate_timestamp(self, timestamp: str) -> Optional[str]:
  175. """
  176. Validate ISO 8601 timestamp format.
  177. Args:
  178. timestamp: Timestamp string to validate
  179. Returns:
  180. Error message if invalid, None if valid
  181. """
  182. # Try parsing common ISO 8601 formats
  183. formats = [
  184. '%Y-%m-%dT%H:%M:%S.%fZ',
  185. '%Y-%m-%dT%H:%M:%SZ',
  186. '%Y-%m-%dT%H:%M:%S.%f+00:00',
  187. '%Y-%m-%dT%H:%M:%S+00:00',
  188. '%Y-%m-%dT%H:%M:%S.%f',
  189. '%Y-%m-%dT%H:%M:%S',
  190. ]
  191. for fmt in formats:
  192. try:
  193. datetime.strptime(timestamp, fmt)
  194. return None
  195. except ValueError:
  196. continue
  197. # Try fromisoformat as fallback (Python 3.7+)
  198. try:
  199. # Handle 'Z' suffix
  200. ts = timestamp.replace('Z', '+00:00')
  201. datetime.fromisoformat(ts)
  202. return None
  203. except ValueError:
  204. pass
  205. return f"metadata.scan_timestamp '{timestamp}' is not a valid ISO 8601 timestamp"
  206. def _validate_resources(self, resources: Any) -> List[str]:
  207. """
  208. Validate the resources section of scan data.
  209. Args:
  210. resources: Resources dictionary to validate
  211. Returns:
  212. List of validation error messages
  213. Validates:
  214. - Property 2: resources organized by service type
  215. """
  216. errors = []
  217. # Check if resources is a dictionary
  218. if not isinstance(resources, dict):
  219. errors.append("resources must be a JSON object (dictionary) organized by service type")
  220. return errors
  221. # Validate each service's resources
  222. for service, resource_list in resources.items():
  223. if not isinstance(resource_list, list):
  224. errors.append(f"resources.{service} must be an array of resources")
  225. continue
  226. # Validate each resource in the list
  227. for idx, resource in enumerate(resource_list):
  228. resource_errors = self._validate_resource(resource, service, idx)
  229. errors.extend(resource_errors)
  230. return errors
  231. def _validate_resource(
  232. self,
  233. resource: Any,
  234. service: str,
  235. index: int
  236. ) -> List[str]:
  237. """
  238. Validate a single resource entry.
  239. Args:
  240. resource: Resource dictionary to validate
  241. service: Service name for context
  242. index: Index in the resource list for context
  243. Returns:
  244. List of validation error messages
  245. """
  246. errors = []
  247. context = f"resources.{service}[{index}]"
  248. if not isinstance(resource, dict):
  249. errors.append(f"{context} must be a JSON object (dictionary)")
  250. return errors
  251. # Check required resource fields
  252. missing_fields = []
  253. for field in self.REQUIRED_RESOURCE_FIELDS:
  254. if field not in resource:
  255. missing_fields.append(field)
  256. if missing_fields:
  257. errors.append(
  258. f"{context} missing required fields: {', '.join(missing_fields)}"
  259. )
  260. # Validate attributes field if present (should be a dict)
  261. if 'attributes' in resource and not isinstance(resource['attributes'], dict):
  262. errors.append(f"{context}.attributes must be a JSON object (dictionary)")
  263. return errors
  264. def _validate_errors_field(self, errors_list: Any) -> List[str]:
  265. """
  266. Validate the errors section of scan data.
  267. Args:
  268. errors_list: Errors list to validate
  269. Returns:
  270. List of validation error messages
  271. Validates:
  272. - Property 2: errors list with error information
  273. """
  274. validation_errors = []
  275. # Check if errors is a list
  276. if not isinstance(errors_list, list):
  277. validation_errors.append("errors must be an array")
  278. return validation_errors
  279. # Validate each error entry
  280. for idx, error_entry in enumerate(errors_list):
  281. error_errors = self._validate_error_entry(error_entry, idx)
  282. validation_errors.extend(error_errors)
  283. return validation_errors
  284. def _validate_error_entry(self, error_entry: Any, index: int) -> List[str]:
  285. """
  286. Validate a single error entry.
  287. Args:
  288. error_entry: Error dictionary to validate
  289. index: Index in the errors list for context
  290. Returns:
  291. List of validation error messages
  292. """
  293. errors = []
  294. context = f"errors[{index}]"
  295. if not isinstance(error_entry, dict):
  296. errors.append(f"{context} must be a JSON object (dictionary)")
  297. return errors
  298. # Check required error fields
  299. missing_fields = []
  300. for field in self.REQUIRED_ERROR_FIELDS:
  301. if field not in error_entry:
  302. missing_fields.append(field)
  303. if missing_fields:
  304. errors.append(
  305. f"{context} missing required fields: {', '.join(missing_fields)}"
  306. )
  307. return errors
  308. def convert_to_scan_result(self, data: Dict[str, Any]) -> ScanResult:
  309. """
  310. Convert uploaded JSON data to a ScanResult object.
  311. This method transforms the uploaded scan data into a ScanResult object
  312. that is compatible with the existing ReportGenerator service.
  313. Args:
  314. data: Validated scan data dictionary
  315. Returns:
  316. ScanResult object compatible with ReportGenerator
  317. Requirements:
  318. - 5.1: Generate reports in the same format as existing scan tasks
  319. Note:
  320. This method assumes the data has already been validated using
  321. validate_scan_data(). Calling this with invalid data may raise
  322. exceptions.
  323. Validates:
  324. - Property 8: Report generation consistency - converts to same format
  325. used by credential-based scanning
  326. """
  327. metadata = data.get('metadata', {})
  328. resources_data = data.get('resources', {})
  329. errors_data = data.get('errors', [])
  330. # Create ScanResult with success=True (we have valid data)
  331. result = ScanResult(success=True)
  332. # Convert resources to ResourceData objects
  333. for service, resource_list in resources_data.items():
  334. for resource_dict in resource_list:
  335. resource = self._convert_resource(resource_dict)
  336. result.add_resource(service, resource)
  337. # Add errors
  338. for error_dict in errors_data:
  339. result.add_error(
  340. service=error_dict.get('service', 'unknown'),
  341. region=error_dict.get('region', 'unknown'),
  342. error=error_dict.get('error', 'Unknown error'),
  343. details=error_dict.get('details'),
  344. error_type=error_dict.get('error_type', 'Unknown'),
  345. )
  346. # Set metadata
  347. result.metadata = {
  348. 'account_id': metadata.get('account_id', ''),
  349. 'regions_scanned': metadata.get('regions_scanned', []),
  350. 'services_scanned': metadata.get('services_scanned', []),
  351. 'total_resources': sum(len(r) for r in result.resources.values()),
  352. 'total_errors': len(result.errors),
  353. 'scan_timestamp': metadata.get('scan_timestamp', ''),
  354. 'scanner_version': metadata.get('scanner_version', ''),
  355. 'source': 'upload', # Mark as uploaded data
  356. }
  357. logger.info(
  358. f"Converted scan data: {result.metadata['total_resources']} resources, "
  359. f"{result.metadata['total_errors']} errors"
  360. )
  361. return result
  362. def _convert_resource(self, resource_dict: Dict[str, Any]) -> ResourceData:
  363. """
  364. Convert a resource dictionary to a ResourceData object.
  365. Args:
  366. resource_dict: Resource dictionary from uploaded data
  367. Returns:
  368. ResourceData object
  369. """
  370. return ResourceData(
  371. account_id=resource_dict.get('account_id', ''),
  372. region=resource_dict.get('region', ''),
  373. service=resource_dict.get('service', ''),
  374. resource_type=resource_dict.get('resource_type', ''),
  375. resource_id=resource_dict.get('resource_id', ''),
  376. name=resource_dict.get('name', ''),
  377. attributes=resource_dict.get('attributes', {}),
  378. )